source: ValBot/Python/check_interwiki_links.py@ 1211

Last change on this file since 1211 was 1207, checked in by iritscen, 5 weeks ago

ValBot: Added throttle to check_interwiki_links.py to avoid Wikipedia kicking us out with error 429.

File size: 13.8 KB
# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered interwiki prefix and loads the linked page, verifying that it exists and that
# any section link, if present, is valid as well. The output will use the word "ERROR" when it cannot validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----|
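# Usage: python check_interwiki_links.py -page:"Page name"
#        python check_interwiki_links.py -cat:"Category name"  (the category is searched recursively)
# Pass -dbg to also print each page and link as it is checked.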

import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging
import time

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response):
        self.iw_prefix = iw_prefix # e.g. "wp" as in [[wp:Marathon (series)#Rampancy]]
        self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy"
        self.page_name = page_name # "Marathon (series)#Rampancy"
        self.page_name_only = page_name_only # "Marathon (series)"
        self.page_slug = page_slug # "Marathon_(series)#Rampancy"
        self.hosting_page = hosting_page # "Easter eggs"; page where the link was found
        self.curl_response = curl_response # an instance of the Response class defined in the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')
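# (The two tuples above are parallel: scan_for_interwiki_links() walks them by a shared index, so a prefix added to
# interwiki_prefixes needs its URL inserted at the same position in interwiki_urls.)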

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0
request_delay = 1.5 # base number of seconds to wait between HTTP requests
max_retries = 3     # abort the script after this many consecutive 429 responses for one link
backoff_factor = 2  # multiply the current delay by this after each 429
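# (With the values above, a link that keeps drawing error 429 is retried after waits of 3.0, 6.0 and 12.0 seconds,
# the delay doubling each time, before test_interwiki_link() aborts the script.)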

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(the_link):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(the_link.hosting_page))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link
    _, anchor_name = the_link.page_slug.split('#', 1)

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)
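    # (e.g. these substitutions turn the anchor "Marathon_.28series.29" into "Marathon_(series)")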

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if not found_section:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
        errors_issued += 1
    elif print_result:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))

# For a link that redirected us to another page, extract the name of the target page from the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name">
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued += 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report an error and avoid slamming the output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued += 1
        else:
            the_link.page_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                the_link.page_name_only, _ = the_link.page_slug.split('#', 1)
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_name_only, the_link.page_name))
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, the_link.page_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    # We have to carefully throttle requests because otherwise we will get hit with a 429: Too Many Requests
    attempt = 0
    delay = request_delay
    while True:
        time.sleep(delay)

        the_link.curl_response = fetch(the_link.full_url)

        if the_link.curl_response.status_code != 429:
            break

        attempt += 1
        if attempt > max_retries:
            pywikibot.stdout(f' ERROR: Maximum retries after error 429 exceeded for "{the_link.page_slug}". Aborting script.')
            raise SystemExit(1)

        # Increase the delay if we got the error
        delay *= backoff_factor
        pywikibot.stdout(f' WARNING: Received error 429 for "{the_link.page_slug}". Retrying in {delay:.1f}s...')

    # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from
    # pages such as Special:PermanentLink.
    if the_link.curl_response.history:
        possibly_print(the_link)

        # If the linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued += 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued += 1
    # However, the usual way that a redirect occurs is that MediaWiki silently serves the content of the target page under the original URL, returning code OK 200 as
    # if the link was correct; this happens when a redirect page is accessed. We must detect these soft redirects by looking at the page source to find the redirect
    # note inserted at the top of the page for the reader.
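    # (In current MediaWiki output, that note's markup is along the lines of
    # <span class="mw-redirectedfrom">(Redirected from <a href="…">Title</a>)</span>, hence the search string below,
    # though the exact class and wrapper may vary.)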
    elif 'Redirected from <a' in the_link.curl_response.text:
        unintended_redirects_found += 1
        possibly_print(the_link)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
        find_canonical_link(the_link) # calls find_section() at end
    elif '#' in the_link.page_slug:
        find_section(the_link, False)

# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked += 1
    cur_prefix = 0
    name_printed = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
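        # (e.g. for the prefix "wp", this matches "[[wp:Marathon (series)#Rampancy]]" as well as the link portion of "[[wp:Marathon (series)|piped text]]")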
        for match in re.finditer(iw_link, page_text):
            the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", "", "", "", page_name, "")

            # Extract just the page title from this regex match
            s = match.start() + 2 + len(the_link.iw_prefix) + 1
            e = match.end() - 1

            # Use underscores in the slug used to construct the URL, but retain spaces for the printable name
            the_link.page_slug = page_text[s:e].replace(' ', '_')
            the_link.page_name = page_text[s:e]
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, the_link.page_name))
            iw_found += 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')
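            # (e.g. "[[wp:fr:Paris]]" yields https://en.wikipedia.org/wiki/fr:Paris above, which the two replacements turn into https://fr.wikipedia.org/wiki/Paris)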

            # Test the URL
            test_interwiki_link(the_link)
        cur_prefix += 1

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirects were"
    if unintended_redirects_found == 1:
        warning_str = "likely-unintended redirect was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    #pywikibot.stdout('The members of the requests.models.Response class are:')
    #pywikibot.stdout(format(dir(requests.models.Response)))
    #return

    # Check specified page or loop through specified category and check all pages
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100): # preload page text in batches of 100
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()