source: ValBot/Python/check_intrawiki_section_links.py@ 1211

Last change on this file since 1211 was 1205, checked in by iritscen, 3 months ago

ValBot: Revised logic in check_intrawiki_section_links.py as MediaWiki now apparently returns response 301 when the user is redirected by a redirect page.

File size: 19.2 KB
Line 
1# Check Intrawiki Section Links
2# by iritscen@yahoo.com
3# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4# and loads the linked page and verifies that the named section actually exists. It also
5# understands section links generated through a call to Template:SectionLink.
6# Recommended viewing width:
7# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8
9import os
10
11from urllib.parse import urljoin
12
13import pywikibot
14import re
15
16from pywikibot.bot import QuitKeyboardInterrupt
17from pywikibot import pagegenerators
18from pywikibot.comms.http import fetch
19from pywikibot.specialbots import UploadRobot
20from bs4 import BeautifulSoup
21
# Tuple of OniGalore's namespaces; used to recognize intrawiki links that live outside the
# Main namespace
intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')

# URL for main namespace of our wiki; page slugs are appended to this to build testable URLs
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
# (a separate script, check_interwiki_links.py, handles those)
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion;
# indexed by chapter number, so valid indices are 0 through 14
chapter_names = ['CHAPTER_00_._COMBAT_TRAINING', 'CHAPTER_01_._TRIAL_RUN', 'CHAPTER_02_._ENGINES_OF_EVIL', 'CHAPTER_03_._PUZZLE_PIECES', 'CHAPTER_04_._TIGER_BY_THE_TAIL', 'CHAPTER_05_._HOT_PURSUIT', 'CHAPTER_06_._COUNTERATTACK', 'CHAPTER_07_._A_FRIEND_IN_NEED', 'CHAPTER_08_._AN_INNOCENT_LIFE', 'CHAPTER_09_._TRUTH_AND_CONSEQUENCES', 'CHAPTER_10_._CAT_AND_MOUSE', 'CHAPTER_11_._DREAM_DIVER', 'CHAPTER_12_._SINS_OF_THE_FATHER', 'CHAPTER_13_._PHOENIX_RISING', 'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS']

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
link_patterns = (r"\[\[[^|\]]*(\||\])", r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}")

# Initialize globals
debug = 0           # set to 1 by the "-dbg" command-line argument
pages_checked = 0   # number of pages scanned so far
iw_found = 0        # number of intrawiki section links found
advice_issued = 0   # number of "ADVICE" notices printed (possible problems)
errors_issued = 0   # number of "ERROR" notices printed (definite problems)
name_printed = 0    # whether the current page's name has been printed yet (reset per page)
46
# Print the name of the page on which something occurred, unless it was already printed
# (or unless debug mode is on, in which case page names are printed as they are checked)
def possibly_print(page_name):
    global debug
    global name_printed

    # Guard: nothing to do if the name went out already or debug mode handles it
    if name_printed or debug:
        return

    pywikibot.stdout('')
    pywikibot.stdout('From page "{}":'.format(page_name))
    name_printed = 1
56
# Search a page for the section specified in the link.
#   page_text:    HTML source of the linked (target) page
#   page_name:    name of the page the link was found on (used for error reporting)
#   page_slug:    the link target in "Page_name#anchor" form
#   print_result: when True (and debug is on), print a message if the section is found
# Increments the global errors_issued counter when the section cannot be located.
def find_section(page_text, page_name, page_slug, print_result):
    global errors_issued
    found_section = False

    # Isolate section link or text fragment link
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')

    # First check if this is a text fragment directive, and look for it if so
    if anchor_name.startswith(':~:text='):
        if debug: pywikibot.stdout(' Found text fragment directive {} from URL {}.'.format(anchor_name, page_slug))
        # Strip the ':~:text=' prefix (8 characters)
        anchor_name = anchor_name[8:]
        # We're only checking the first text directive, so strip add'l ones if present
        addl_fragment = anchor_name.find('&text=')
        if addl_fragment != -1:
            anchor_name = anchor_name[:addl_fragment]
        search_terms = anchor_name.split(',')
        # Delete prefix and suffix terms because they aren't needed
        if search_terms[0].endswith('-'):
            search_terms.pop(0)
        if search_terms[-1].startswith('-'):
            search_terms.pop()
        # Remake text directive with the terms separated by spaces as they should be in the page
        # text
        newSep = ' '
        search_string = newSep.join(search_terms)
        if debug: pywikibot.stdout(' Converted text fragment to string "{}".'.format(search_string))
        if search_string in page_text:
            found_section = True
            if debug and not print_result: pywikibot.stdout(' Found text fragment!')

    # If we're still here, it's a section link; read linked page to see if it really has this
    # anchor link
    if found_section == False:
        if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
        soup = BeautifulSoup(page_text, 'html.parser')
        # Search for a span with this ID
        for span_tag in soup.findAll('span'):
            span_name = span_tag.get('id', None)
            if span_name == anchor_name:
                if debug and not print_result: pywikibot.stdout(' Found section in a span!')
                found_section = True
                break
        if found_section == False:
            # Search for a div with this ID
            for span_tag in soup.findAll('div'):
                span_name = span_tag.get('id', None)
                if span_name == anchor_name:
                    if debug and not print_result: pywikibot.stdout(' Found section in a div!')
                    found_section = True
                    break
        # Neither a span nor a div carries this ID, so the section link is broken
        if found_section == False:
            possibly_print(page_name)
            pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
            errors_issued += 1
        elif debug and print_result:
            pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
115
# For a link that redirected us to another page, extract the name of the target page from the
# target page's source.
#   page_text: HTML source of the page we were redirected to
#   page_name: name of the page the link was found on (used for error reporting)
#   page_slug: the original link target in "Page_name#anchor" form
# On success, reports the redirect target (and re-checks any section link against the target
# page); on failure, increments the global errors_issued counter.
def find_canonical_link(page_text, page_name, page_slug):
    # Fix: this declaration was missing, so the "errors_issued = errors_issued + 1" lines
    # below raised UnboundLocalError whenever an error branch was taken
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    # "wgPageName":"Namespace:Page_name",
    canonical_name = page_text.split('"wgPageName":"')[-1]
    tag_end = canonical_name.find('",')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
        errors_issued += 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the output
            # with massive page source from a failed trim.
            # Fix: the original format string mixed automatic ("{}") and manual ("{2}") field
            # numbering, which makes str.format() raise ValueError instead of printing
            pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
            errors_issued += 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in page_slug:
                # The link had a section anchor, so verify it exists on the target page
                _, anchor_name = page_slug.split('#')
                if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
                find_section(page_text, page_name, page_slug, True)
            else:
                pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))
143
# Test an intrawiki link and look for a section link if applicable.
#   iw_url:    the full URL to fetch
#   page_name: name of the page the link was found on (used for error reporting)
#   page_slug: the link target in "Page_name#anchor" form
def test_intrawiki_link(iw_url, page_name, page_slug):
    global advice_issued
    global errors_issued

    response = fetch(iw_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    permalink_prefixes = ('Special:PermanentLink/'.lower(), 'Special:Permalink/'.lower())
    slug_lc = page_slug.lower()
    went_through_history = response.history != []

    if went_through_history and slug_lc.startswith(permalink_prefixes):
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
        find_canonical_link(response.text, page_name, page_slug)
    elif 'Redirected from <a' in response.text:
        # However the usual way that a redirect occurs is that a redirect page is visited and
        # MediaWiki sends us to the new page using JavaScript while returning code 301. Formerly it
        # used to return 200 as if the link was correct, so rather than looking for code 301 we
        # detect these soft redirects by looking at the page source to find the redirect note that
        # gets inserted at the top of the page for the reader.
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
        find_canonical_link(response.text, page_name, page_slug)
    elif response.status_code != 200:
        # This handles response codes other than 200 and 301 (301 is returned in the above case
        # of a silent redirect)
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
        errors_issued += 1
    else:
        # URL is OK, so proceed
        find_section(response.text, page_name, page_slug, False)
180
# Searches the given page text for intrawiki links with section links in them.
#   page_text: the wikitext of the page to scan
#   page_name: the title of that page (used to resolve relative links and for reporting)
# For each "[[Page#Section]]" link or "{{SectionLink|Page|Section}}" call found, builds the
# page's URL, fetches it, and verifies the section anchor exists. Updates the global
# counters (pages_checked, iw_found, advice_issued, errors_issued) as it goes.
def scan_for_intrawiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)
            if i == 1:
                target_start = 14 # "{{SectionLink|"
                target_end = 2 # "}}"
            s = match.start() + target_start # remove the link-opening markup
            e = match.end() - target_end # remove the link-ending markup
            page_slug = page_text[s:e]

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if not '#' in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
            # for the task of checking interwiki page+section links
            is_interwiki = False
            if found_iw_match == False:
                for prefix in interwiki_prefixes:
                    if prefix + ":" in page_slug:
                        if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                        is_interwiki = True
                        break
            if is_interwiki:
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded text.
            # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using
            # our "chapter_names" array. If it's another type of transclusion, punt it to the user.
            if '{' in page_slug:
                ch_link_pattern = re.compile(r"{{C[0-9]*}}")
                ch_link = ch_link_pattern.search(page_slug)
                if ch_link:
                    # Fix: this debug print formerly ran BEFORE the "if ch_link:" check, so a
                    # link containing '{' but no "{{Cn}}" crashed on ch_link.group(0) (None
                    # dereference) in debug mode instead of reaching the ADVICE branch below
                    if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))
                    ch_link_match = ch_link.group(0)
                    ch_num_pattern = re.compile("[0-9]+")
                    ch_num = ch_num_pattern.search(ch_link_match)
                    if ch_num:
                        ch_num_match = int(ch_num.group(0))
                        if ch_num_match >= 0 and ch_num_match <= 14:
                            ch_name = chapter_names[ch_num_match]
                            replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}")
                            page_slug = replace_pattern.sub(ch_name, page_slug)
                            if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))
                        else:
                            possibly_print(page_name)
                            pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                            errors_issued += 1
                            continue
                    else:
                        possibly_print(page_name)
                        pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                        advice_issued += 1
                        continue
                else:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

            # If this is a relative "/" link, use the current page as the basis for the URL. Note
            # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
            # we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that page,
            # then remove the relative portion of the link. Note that this is only performed once,
            # so if there's multiple steps back ("../../"), we're out of luck.
            if page_slug.startswith('../'):
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:len(page_slug)]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't add a
                # slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would just be
            # part of the text) then it's a Main namespace article; proceed with building URL
            if found_iw_match == False:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if found_iw_match == False:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong
            if (found_iw_match == False) or (iw_url == ""):
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)
335
# Print a wrap-up message summarizing how many pages and links were checked and how many
# advisories and errors were emitted along the way
def print_summary():
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued

    # Pick singular or plural wording to match each count
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
365
# Main function: parse the command line, then check either one page ("-page:") or every page
# in a category ("-cat:", recursively) for intrawiki section links, and print a summary
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            search_cat = arg[len('-cat:'):]
        elif arg.startswith('-page:'):
            search_page = arg[len('-page:'):]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # This line of code enumerates the methods in the 'page' class
    #pywikibot.stdout(format(dir(page)))

    # Check specified page or loop through specified category and check all pages
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
            scan_for_intrawiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.