source: ValBot/Python/check_interwiki_links.py@ 1197

Last change on this file since 1197 was 1197, checked in by iritscen, 17 hours ago

ValBot: Removed TODO comment from check_interwiki_links.sh which was completed in the last commit.

File size: 13.1 KB
Line 
1# Check Interwiki Links
2# by iritscen@yahoo.com
3# Looks at each link on a page (or all the pages in a category) which uses a registered
4# interwiki prefix and loads the linked page, verifying that it exists and that any section
5# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
6# validate the interwiki link.
7# Recommended viewing width:
8# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
9
10import bs4
11import pywikibot
12import re
13import requests # for listing members with dir() when debugging
14
15from bs4 import BeautifulSoup
16from pywikibot import pagegenerators
17from pywikibot.bot import QuitKeyboardInterrupt
18from pywikibot.comms.http import fetch
19from pywikibot.specialbots import UploadRobot
20from pywikibot.tools.formatter import color_format
21from urllib.parse import urljoin
22
class IWLink:
    """Bundle of everything known about one interwiki link found on a wiki page.

    Instances are created by scan_for_interwiki_links() and then filled in /
    mutated by test_interwiki_link() and find_canonical_link().
    """

    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
        self.iw_prefix = iw_prefix         # e.g. "wp"
        self.prefix_url = prefix_url       # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url           # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
        self.page_name = page_name         # "Easter egg"
        self.page_slug = page_slug         # "Easter_egg"
        self.curl_response = curl_response # a class defined in the Requests library

    def __repr__(self):
        # Aid debugging: show the prefix and slug that identify this link
        return 'IWLink(iw_prefix={0!r}, page_slug={1!r}, full_url={2!r})'.format(
            self.iw_prefix, self.page_slug, self.full_url)
31
# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
# NOTE: interwiki_urls[i] is the base URL for interwiki_prefixes[i]; the two
# tuples must stay the same length and in the same order because
# scan_for_interwiki_links() walks them with a shared index.
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')
36
# Initialize globals
debug = 0                        # set to 1 by the -dbg argument; enables per-link progress output
pages_checked = 0                # how many wiki pages have been scanned
iw_found = 0                     # how many interwiki links have been found across all pages
errors_issued = 0                # how many "ERROR" messages have been printed
unintended_redirects_found = 0   # how many silent (soft) redirects have been detected
name_printed = 0                 # per-page flag: has the current page's name been printed yet?
44
# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    global debug
    global name_printed

    # Nothing to do if the name was already shown, or in debug mode (where every
    # page name is printed as it is checked anyway)
    if name_printed or debug:
        return

    pywikibot.stdout('')
    pywikibot.stdout('From page "{}":'.format(page_name))
    name_printed = 1
54
# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link. Split only on the first '#' so a slug that happens to
    # contain additional '#' characters cannot raise a ValueError from unpacking.
    target_page_name, anchor_name = the_link.page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)

    # Read linked page to see if it really has this anchor link; only these tag
    # types are considered as possible anchor targets
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    # any() short-circuits on the first matching tag, like the original
    # nested-loop-with-break but without the manual flag bookkeeping
    found_section = any(
        the_tag.get('id') == anchor_name
        for tag_name in tags_to_search
        for the_tag in soup.find_all(tag_name)
    )

    # Tell user what we found
    if not found_section:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
        errors_issued = errors_issued + 1
    elif print_result == True:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
87
# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(the_link):
    # BUG FIX: without this declaration, the "errors_issued = errors_issued + 1"
    # lines below raised UnboundLocalError instead of updating the module-level
    # error counter, crashing the script on the first malformed redirect.
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the
            # output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                _, anchor_name = the_link.page_slug.split('#')
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
                the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name) # update page slug so that find_section() uses the right page name in its messages
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))
117
# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)
    response = the_link.curl_response
    slug = the_link.page_slug

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    if response.history:
        possibly_print(the_link.page_name)

        # An all-caps target like WP:BEANS is likely a deliberate use of a redirect shortcut
        if slug.startswith('WP:') and slug == slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(response.history[0], the_link.iw_prefix, slug))
            find_canonical_link(the_link)
        # Permanent-revision links always redirect; follow them to the live page
        elif slug.lower().startswith(('special:permanentlink/', 'special:permalink/')):
            pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(response.history[0], the_link.iw_prefix, slug))
            find_canonical_link(the_link)
        else:
            pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(response.history[0], the_link.iw_prefix, slug))
            errors_issued = errors_issued + 1
    elif response.status_code != 200:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(response.status_code, the_link.iw_prefix, slug))
        errors_issued = errors_issued + 1
    # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
    # using JavaScript, while returning code OK 200 as if the link was correct; this happens
    # when a redirect page is accessed. We must detect these soft redirects by looking at the
    # page source to find the redirect note inserted at the top of the page for the reader.
    elif 'Redirected from <a' in response.text:
        unintended_redirects_found = unintended_redirects_found + 1
        possibly_print(the_link.page_name)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, slug))
        find_canonical_link(the_link)
    elif '#' in slug:
        find_section(the_link, False)
160
# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked = pages_checked + 1
    name_printed = 0

    # Walk the parallel prefix/URL tuples in lockstep
    for prefix, prefix_url in zip(interwiki_prefixes, interwiki_urls):
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        link_pattern = r"\[\[" + prefix + r":[^|\]]*(\||\])"
        for match in re.finditer(link_pattern, page_text):
            the_link = IWLink(prefix, prefix_url, "", page_name, "", "")

            # Extract just the page title from this regex match: skip past
            # "[[" + prefix + ":" and drop the trailing "|" or "]"
            title_start = match.start() + 2 + len(prefix) + 1
            title_end = match.end() - 1

            # Commonly we use spaces instead of underscores, so fix that before querying
            the_link.page_slug = page_text[title_start:title_end].replace(' ', '_')

            # But use spaces for title when printing it
            page_title_human = the_link.page_slug.replace('_', ' ')
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, page_title_human))
            iw_found = iw_found + 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')

            # Test the URL
            test_interwiki_link(the_link)
203
# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    # Pick singular or plural wording for each counter
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"
    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirect was" if unintended_redirects_found == 1 else "likely-unintended redirects were"
    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))
232
# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            search_cat = arg[len('-cat:'):]
        elif arg.startswith('-page:'):
            search_page = arg[len('-page:'):]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    site = pywikibot.Site()

    # Check specified page or loop through specified category and check all pages
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.