source: ValBot/Python/check_interwiki_links.py@ 1200

Last change on this file since 1200 was 1200, checked in by iritscen, 3 days ago

Reverted last commit now that test is done.

File size: 13.0 KB
# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered interwiki prefix and loads the linked page, verifying that it exists and that
# any section link, if present, is valid as well. The output will use the word "ERROR" when it cannot validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----|
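# Example invocations (the page and category names below are placeholders; a configured Pywikibot
# installation for the target wiki is assumed):
#   python check_interwiki_links.py -page:"Some page"
#   python check_interwiki_links.py -cat:"Some category" -dbg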

import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from pywikibot.tools.formatter import color_format
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response):
        self.iw_prefix = iw_prefix           # e.g. "wp"
        self.prefix_url = prefix_url         # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url             # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy"
        self.page_name = page_name           # "Marathon (series)#Rampancy"
        self.page_name_only = page_name_only # "Marathon (series)"
        self.page_slug = page_slug           # "Marathon_(series)#Rampancy"
        self.hosting_page = hosting_page     # "Easter eggs"; page where the link was found
        self.curl_response = curl_response   # a Response object from the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(the_link):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(the_link.hosting_page))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link
    _, anchor_name = the_link.page_slug.split('#')

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if found_section == False:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
        errors_issued = errors_issued + 1
    elif print_result == True:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
# For a link that redirected us to another page, extract the name of the target page from the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract the name of the redirected-to page from markup like this:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report an error and avoid slamming the output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            the_link.page_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                the_link.page_name_only, _ = the_link.page_slug.split('#')
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_name_only, the_link.page_name))
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, the_link.page_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from
    # pages such as Special:PermanentLink.
    if the_link.curl_response.history != []:
        possibly_print(the_link)

        # If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued = errors_issued + 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    # However, the usual way that a redirect occurs is that MediaWiki silently serves the target page's content while returning 200 OK as if the link were correct; this
    # happens when a redirect page is accessed. We must detect these soft redirects by looking at the page source to find the redirect note inserted at the top of the
    # page for the reader.
    elif 'Redirected from <a' in the_link.curl_response.text:
        unintended_redirects_found = unintended_redirects_found + 1
        possibly_print(the_link)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
        find_canonical_link(the_link) # calls find_section() at end
    elif '#' in the_link.page_slug:
        find_section(the_link, False)

# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked = pages_checked + 1
    cur_prefix = 0
    name_printed = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
        for match in re.finditer(iw_link, page_text):
            the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", "", "", "", page_name, "")

            # Extract just the page title from this regex match
            s = match.start() + 2 + len(the_link.iw_prefix) + 1
            e = match.end() - 1

            # Use underscores in the slug used to construct the URL, but retain spaces for the printable name
            the_link.page_slug = page_text[s:e].replace(' ', '_')
            the_link.page_name = page_text[s:e]
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, the_link.page_name))
            iw_found = iw_found + 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')

            # Test the URL
            test_interwiki_link(the_link)
        cur_prefix = cur_prefix + 1

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirects were"
    if unintended_redirects_found == 1:
        warning_str = "likely-unintended redirect was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    #pywikibot.stdout('The members of the requests.models.Response class are:')
    #pywikibot.stdout(format(dir(requests.models.Response)))
    #return

    # Check specified page or loop through specified category and check all pages
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()