source: ValBot/Python/check_interwiki_links.py@ 1196

Last change on this file since 1196 was 1196, checked in by iritscen, 27 hours ago

ValBot: check_interwiki_links.sh now tallies and more clearly marks redirects that are probably not intended. Redirect target page is now correctly stated in one message about redirects. Streamlined code somewhat.

File size: 13.2 KB
# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered
# interwiki prefix and loads the linked page, verifying that it exists and that any section
# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
# validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
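#
# Example invocations (a sketch based on the arguments handled in main() below; the exact
# launcher, e.g. "python pwb.py", depends on the local Pywikibot setup):
#   check_interwiki_links.py -page:"Some Page"      (check the links on a single page)
#   check_interwiki_links.py -cat:"Some Category"   (check every page in a category, recursively)
#   check_interwiki_links.py -page:"Some Page" -dbg (also print each link as it is validated)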

import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from pywikibot.tools.formatter import color_format
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
        self.iw_prefix = iw_prefix # e.g. "wp"
        self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
        self.page_name = page_name # "Easter egg"
        self.page_slug = page_slug # "Easter_egg"
        self.curl_response = curl_response # a class defined in the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')
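# (Each prefix in interwiki_prefixes maps to the URL at the same index in interwiki_urls,
# e.g. 'wp' -> 'https://en.wikipedia.org/wiki/', so the two tuples must stay the same length
# and in the same order.)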

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(page_name))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link
    target_page_name, anchor_name = the_link.page_slug.split('#')
    target_page_name_human = target_page_name.replace('_', ' ')

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)
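    # (The ".XX" forms above are MediaWiki's dot-encoded hex notation for special characters
    # in section anchors; e.g. a section named "Easter egg (media)" may appear in a link as
    # "#Easter_egg_.28media.29".)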

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if found_section == False:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
        # TODO: Check that page name has been corrected to redirected page if there was a redirect
        errors_issued = errors_issued + 1
    elif print_result == True:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))

# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the
            # output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                _, anchor_name = the_link.page_slug.split('#')
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
                the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name) # update page slug so that find_section() uses the right page name in its messages
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    if the_link.curl_response.history != []:
        possibly_print(the_link.page_name)

        # If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued = errors_issued + 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    # However, the usual way that a redirect occurs is that MediaWiki silently serves the
    # target page's content while returning code OK 200 as if the link were correct; this
    # happens when a redirect page is accessed. We must detect these soft redirects by looking
    # at the page source to find the redirect note inserted at the top of the page for the reader.
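    # (In the rendered HTML that note looks like "(Redirected from <a ...>Old title</a>)",
    # which is the substring the check below relies on.)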
    elif 'Redirected from <a' in the_link.curl_response.text:
        unintended_redirects_found = unintended_redirects_found + 1
        possibly_print(the_link.page_name)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
        find_canonical_link(the_link)
    elif '#' in the_link.page_slug:
        find_section(the_link, False)

# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked = pages_checked + 1
    cur_prefix = 0
    name_printed = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
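        # (e.g. for the prefix "wp", this pattern matches both "[[wp:Easter egg]]" and the
        # piped form "[[wp:Easter egg|eggs]]", with the match ending at the "|" or "]".)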
        for match in re.finditer(iw_link, page_text):
            the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", page_name, "", "")

            # Extract just the page title from this regex match
            s = match.start() + 2 + len(the_link.iw_prefix) + 1
            e = match.end() - 1

            # Commonly we use spaces instead of underscores, so fix that before querying
            the_link.page_slug = page_text[s:e].replace(' ', '_')

            # But use spaces for title when printing it
            page_title_human = the_link.page_slug.replace('_', ' ')
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, page_title_human))
            iw_found = iw_found + 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
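            # (e.g. a hypothetical link "[[wp:fr:Easter egg]]" should be fetched from
            # https://fr.wikipedia.org/wiki/Easter_egg rather than from the English Wikipedia)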
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')

            # Test the URL
            test_interwiki_link(the_link)
        cur_prefix = cur_prefix + 1

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirects were"
    if unintended_redirects_found == 1:
        warning_str = "likely-unintended redirect was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    #pywikibot.stdout('The members of the requests.models.Response class are:')
    #pywikibot.stdout(format(dir(requests.models.Response)))
    #return

    # Check specified page or loop through specified category and check all pages
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
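        # (PreloadingGenerator's second argument is the batch size, so page texts should be
        # fetched from the wiki in groups of 100 to reduce the number of API calls)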
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()