source: ValBot/Python/check_interwiki_links.py @ 1205

Last change on this file since 1205 was 1204, checked in by iritscen, 3 months ago

ValBot: Removed import that was causing trouble with current version of Pywikibot; it wasn't being used anyway.

File size: 13.0 KB
# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered interwiki prefix and loads the linked page, verifying that it exists and that
# any section link, if present, is valid as well. The output will use the word "ERROR" when it cannot validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----|

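# Sample invocations (assuming the usual Pywikibot "pwb.py" wrapper; see main() for the recognized arguments):
#   python pwb.py check_interwiki_links -page:"Easter eggs"
#   python pwb.py check_interwiki_links -cat:"Some category" -dbg
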
import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response):
        self.iw_prefix = iw_prefix # e.g. "wp"
        self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy"
        self.page_name = page_name # "Marathon (series)#Rampancy"
        self.page_name_only = page_name_only # "Marathon (series)"
        self.page_slug = page_slug # "Marathon_(series)#Rampancy"
        self.hosting_page = hosting_page # "Easter eggs"; page where the link was found
        self.curl_response = curl_response # a Response object from the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')

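# Sanity check: the two tuples above are consumed in parallel, so a length mismatch would silently pair prefixes with the wrong URLs
assert len(interwiki_prefixes) == len(interwiki_urls)
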
# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(the_link):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(the_link.hosting_page))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link; maxsplit=1 guards against a rare extra '#' in the anchor
    _, anchor_name = the_link.page_slug.split('#', 1)

    # Convert dot-notation hex entities to proper characters
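    # (MediaWiki's legacy anchor encoding turns a section named "Foo (bar)" into the id "Foo_.28bar.29", for example)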
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
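    # (MediaWiki can place an anchor id on a headline span, on the heading tag itself, or on a manually-added span or div)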
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if not found_section:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
        errors_issued = errors_issued + 1
    elif print_result:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))

# For a link that redirected us to another page, extract the name of the target page from the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract link from this markup which contains the name of the redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report an error and avoid slamming the output with massive page source from the failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            the_link.page_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                the_link.page_name_only, _ = the_link.page_slug.split('#', 1)
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_name_only, the_link.page_name))
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, the_link.page_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from
    # pages such as Special:PermanentLink.
    if the_link.curl_response.history:
        possibly_print(the_link)

        # If the linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued = errors_issued + 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    # However, the more usual way that a redirect occurs is that MediaWiki silently serves the target page's content under the link's URL, returning 200 OK as if the
    # link were direct; this happens when a redirect page is accessed. We must detect these soft redirects by looking at the page source to find the redirect note
    # inserted at the top of the page for the reader.
    elif 'Redirected from <a' in the_link.curl_response.text:
        unintended_redirects_found = unintended_redirects_found + 1
        possibly_print(the_link)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
        find_canonical_link(the_link) # calls find_section() at the end
    elif '#' in the_link.page_slug:
        find_section(the_link, False)

# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked = pages_checked + 1
    cur_prefix = 0
    name_printed = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
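        # e.g. "[[wp:Marathon (series)#Rampancy|the series]]" yields the match "[[wp:Marathon (series)#Rampancy|"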
        for match in re.finditer(iw_link, page_text):
            the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", "", "", "", page_name, "")

            # Extract just the page title from this regex match
            s = match.start() + 2 + len(the_link.iw_prefix) + 1
            e = match.end() - 1

            # Use underscores in the slug used to construct the URL, but retain spaces for the printable name
            the_link.page_slug = page_text[s:e].replace(' ', '_')
            the_link.page_name = page_text[s:e]
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, the_link.page_name))
            iw_found = iw_found + 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
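            # e.g. the slug "fr:Paris" turns https://en.wikipedia.org/wiki/fr:Paris into https://fr.wikipedia.org/wiki/Paris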
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language (compare case-insensitively to also catch "Wp:" and "wP:")
                if lang_code.lower() != "wp.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')

            # Test the URL
            test_interwiki_link(the_link)
        cur_prefix = cur_prefix + 1

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirects were"
    if unintended_redirects_found == 1:
        warning_str = "likely-unintended redirect was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    #pywikibot.stdout('The members of the requests.models.Response class are:')
    #pywikibot.stdout(format(dir(requests.models.Response)))
    #return

    # Check specified page or loop through specified category and check all pages
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()