Context Navigation

check_intrawiki_section_links.py@ 1194

Last change on this file since 1194 was 1194, checked in by iritscen, 11 months ago
ValBot: check_intrawiki_section_links.py now understands text fragment directives.
File size: 19.4 KB

Line
1	# Check Intrawiki Section Links
2	# by iritscen@yahoo.com
3	# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4	# and loads the linked page and verifies that the named section actually exists. It also
5	# understands section links generated through a call to Template:SectionLink.
6	# Recommended viewing width:
7	# \|---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --\|
8
9	import os
10
11	from urllib.parse import urljoin
12
13	import pywikibot
14	import re
15
16	from pywikibot.bot import QuitKeyboardInterrupt
17	from pywikibot import pagegenerators
18	from pywikibot.tools.formatter import color_format
19	from pywikibot.comms.http import fetch
20	from pywikibot.specialbots import UploadRobot
21	from bs4 import BeautifulSoup
22
# Tuple of OniGalore's namespaces
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File',
    'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help',
    'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE',
    'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk',
)

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks',
    'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies',
    'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp',
)

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion; the list
# index is the chapter number
chapter_names = [
    'CHAPTER_00_._COMBAT_TRAINING',
    'CHAPTER_01_._TRIAL_RUN',
    'CHAPTER_02_._ENGINES_OF_EVIL',
    'CHAPTER_03_._PUZZLE_PIECES',
    'CHAPTER_04_._TIGER_BY_THE_TAIL',
    'CHAPTER_05_._HOT_PURSUIT',
    'CHAPTER_06_._COUNTERATTACK',
    'CHAPTER_07_._A_FRIEND_IN_NEED',
    'CHAPTER_08_._AN_INNOCENT_LIFE',
    'CHAPTER_09_._TRUTH_AND_CONSEQUENCES',
    'CHAPTER_10_._CAT_AND_MOUSE',
    'CHAPTER_11_._DREAM_DIVER',
    'CHAPTER_12_._SINS_OF_THE_FATHER',
    'CHAPTER_13_._PHOENIX_RISING',
    'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS',
]

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]". The match
#            stops at the first "|" or "]" after the link target, so it ends with exactly one
#            closing character.
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
link_patterns = (
    r"\[\[[^|\]]*(\||\])",
    r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}",
)

# Initialize globals
debug = 0           # set to 1 by the "-dbg" argument
pages_checked = 0   # number of pages scanned so far
iw_found = 0        # number of intrawiki section links found
advice_issued = 0   # number of ADVICE messages printed
errors_issued = 0   # number of ERROR messages printed
name_printed = 0    # 1 once the current page's "From page" header has been printed
47
def possibly_print(page_name):
    """Print the "From page" header for page_name, at most once per page.

    Nothing is printed when the header has already been shown (name_printed is set) or when
    the script runs in debug mode. After printing, name_printed is set to 1.
    """
    global name_printed

    # Guard clause: header already shown, or debug mode is on
    if name_printed or debug:
        return

    pywikibot.stdout('')
    pywikibot.stdout('From page "{}":'.format(page_name))
    name_printed = 1
57
def find_section(page_text, page_name, page_slug, print_result):
    """Check that the anchor named in a link exists in the fetched target page.

    Args:
        page_text: Source of the fetched target page, searched for the anchor.
        page_name: Name of the page the link was found on; used for the output header.
        page_slug: Link target in "Page_name#anchor" form. It must contain a '#'.
        print_result: When true, a successful lookup is reported (in debug mode only)
            with the final "was found" message instead of the per-step debug messages.

    An anchor that begins with ":~:text=" is treated as a text fragment directive, and its
    text is searched for in page_text. If that fails, or the anchor is an ordinary section
    name, the page is parsed and searched for a <span> or <div> whose id equals the anchor.
    When nothing matches, an ERROR is printed and errors_issued is incremented.
    """
    global errors_issued
    found_section = False

    # Isolate section link or text fragment link
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')

    # First check if this is a text fragment directive, and look for it if so
    if anchor_name.startswith(':~:text='):
        if debug: pywikibot.stdout(' Found text fragment directive {} from URL {}.'.format(anchor_name, page_slug))
        anchor_name = anchor_name[8:]
        # We're only checking the first text directive, so strip add'l ones if present
        addl_fragment = anchor_name.find('&text=')
        if addl_fragment != -1:
            anchor_name = anchor_name[:addl_fragment]
        search_terms = anchor_name.split(',')
        # Delete prefix and suffix terms because they aren't needed
        if search_terms[0].endswith('-'):
            search_terms.pop(0)
        # The list can be empty here if the directive held only a prefix term, so check
        # before indexing
        if search_terms and search_terms[-1].startswith('-'):
            search_terms.pop()
        # Remake text directive with the terms separated by spaces as they should be in the
        # page text
        search_string = ' '.join(search_terms)
        if debug: pywikibot.stdout(' Converted text fragment to string "{}".'.format(search_string))
        # An empty string is "in" every page, so it must not count as a match
        if search_string and search_string in page_text:
            found_section = True
            if debug and not print_result: pywikibot.stdout(' Found text fragment!')

    # If we're still here, it's a section link; read linked page to see if it really has this
    # anchor link
    if not found_section:
        if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
        soup = BeautifulSoup(page_text, 'html.parser')
        # Search for a span with this ID first, then for a div with this ID
        for tag_name in ('span', 'div'):
            if any(tag.get('id') == anchor_name for tag in soup.find_all(tag_name)):
                if debug and not print_result: pywikibot.stdout(' Found section in a {}!'.format(tag_name))
                found_section = True
                break

    if not found_section:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
        errors_issued += 1
    elif debug and print_result:
        pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
115
def find_canonical_link(page_text, page_name, page_slug):
    """Report the page a redirecting link landed on, then verify its section anchor.

    Args:
        page_text: Source of the page that was served after the redirect.
        page_name: Name of the page the link was found on; used for the output header.
        page_slug: The link as written, in "Page_name" or "Page_name#anchor" form.

    The target page name is taken from the last '"wgPageName":"..."' entry in page_text.
    If it cannot be isolated, or the extracted name is longer than 100 characters, an ERROR
    is printed and errors_issued is incremented. Otherwise, when page_slug has a section
    anchor, find_section() is called to verify it against page_text.
    """
    # Needed because this function assigns to the module-level counter
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # "wgPageName":"Namespace:Page_name",
    canonical_name = page_text.split('"wgPageName":"')[-1]
    tag_end = canonical_name.find('",')

    if tag_end == -1:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
        errors_issued += 1
        return

    canonical_name = canonical_name[:tag_end]
    if len(canonical_name) > 100:
        # Certain things can cause the trim to fail; report error and avoid slamming the
        # output with massive page source from a failed trim
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
        errors_issued += 1
        return

    canonical_name = canonical_name.replace('_', ' ')
    if '#' in page_slug:
        # Split on the first '#' only, as find_section() does, so that a '#' inside the
        # anchor text cannot break the unpacking
        _, anchor_name = page_slug.split('#', 1)
        if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
        find_section(page_text, page_name, page_slug, True)
    else:
        pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))
143
def test_intrawiki_link(iw_url, page_name, page_slug):
    """Fetch an intrawiki link's URL and hand the response to the matching checker.

    Args:
        iw_url: Full URL of the linked page.
        page_name: Name of the page the link was found on; used for the output header.
        page_slug: The link as written, in "Page_name#anchor" form.

    The response is handled according to the first case that applies:
      1. fetch() followed a redirect: permanent-revision links are resolved through
         find_canonical_link(); any other redirect is reported for manual checking.
      2. The status code is not 200: an ERROR is printed and counted.
      3. The page source contains the "Redirected from" note (a soft redirect): the link is
         resolved through find_canonical_link().
      4. Otherwise the page is passed straight to find_section().
    """
    global advice_issued
    global errors_issued

    response = fetch(iw_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from
    # pages such as Special:PermanentLink.
    if response.history != []:
        permalink_prefixes = ('Special:PermanentLink/'.lower(), 'Special:Permalink/'.lower())
        if not page_slug.lower().startswith(permalink_prefixes):
            possibly_print(page_name)
            pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
            # NOTE(review): the message says ERROR but the advice counter is the one
            # incremented; confirm which is intended
            advice_issued += 1
            return
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
        find_canonical_link(response.text, page_name, page_slug)
        return

    if response.status_code != 200:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
        errors_issued += 1
        return

    # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
    # using JavaScript, while returning code OK 200 as if the link was correct; this happens
    # when a redirect page is accessed. We must detect these soft redirects by looking at
    # the page source to find the redirect note inserted at the top of the page for the
    # reader.
    fetched_text = response.text
    if 'Redirected from <a' in fetched_text:
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
        find_canonical_link(fetched_text, page_name, page_slug)
        return

    # URL is OK, so proceed
    find_section(fetched_text, page_name, page_slug, False)
182
def scan_for_intrawiki_links(page_text, page_name):
    """Find every intrawiki link with a section anchor in a page and test each one.

    Args:
        page_text: Wikitext of the page being scanned.
        page_name: Title of the page being scanned.

    Both "[[...]]" wikilinks and "{{SectionLink|...}}" calls are matched (see
    link_patterns). Links without a '#', and links carrying an interwiki prefix, are
    skipped. "{{Cn}}" chapter transclusions are expanded from chapter_names, and relative
    "/" and "../" links are resolved against page_name. Each remaining link is turned into
    a URL and passed to test_intrawiki_link(). The module-level counters pages_checked,
    iw_found, advice_issued and errors_issued are updated, and name_printed is reset for
    the new page.
    """
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    # Compiled once per page instead of once per link
    ch_link_pattern = re.compile(r"{{C[0-9]*}}")
    ch_num_pattern = re.compile("[0-9]+")

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)
            if i == 1:
                target_start = 14 # "{{SectionLink|"
                target_end = 2 # "}}"
            s = match.start() + target_start # remove the link-opening markup
            e = match.end() - target_end # remove the link-ending markup
            page_slug = page_text[s:e]

            # The second link type will look like "Page|Section" or "|Section", so fix that
            # pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it,
            # as MediaWiki takes care of checking basic intrawiki links
            if '#' not in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see
            # check_interwiki_links.py for the task of checking interwiki page+section links
            if any(prefix + ":" in page_slug for prefix in interwiki_prefixes):
                if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded
            # text. If it's a chapter template transclusion like "Quotes/Diary#{{C3}}",
            # expand it using our "chapter_names" array. If it's another type of
            # transclusion, punt it to the user.
            if '{' in page_slug:
                ch_link = ch_link_pattern.search(page_slug)
                if not ch_link:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

                # Only dereference the match once we know that there is one
                ch_link_match = ch_link.group(0)
                if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link_match))
                ch_num = ch_num_pattern.search(ch_link_match)
                if not ch_num:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                    advice_issued += 1
                    continue

                ch_num_match = int(ch_num.group(0))
                if not 0 <= ch_num_match < len(chapter_names):
                    possibly_print(page_name)
                    pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                    errors_issued += 1
                    continue

                # ch_link_match is exactly "{{C<digits>}}", so a literal replace suffices
                page_slug = page_slug.replace(ch_link_match, chapter_names[ch_num_match])
                if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))

            # If this is a relative "/" link, use the current page as the basis for the URL.
            # Note that only a leading slash is looked for, so if there's multiple steps
            # down ("/x/y"), we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that
            # page, then remove the relative portion of the link. Note that this is only
            # performed once, so if there's multiple steps back ("../../"), we're out of
            # luck.
            if page_slug.startswith('../'):
                # NOTE(review): if page_name contains no '/', rfind() returns -1 and the
                # slice below drops the last character of the page name
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't
                # add a slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would
            # just be part of the text) then it's a Main namespace article; proceed with
            # building URL
            if not found_iw_match:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on
            # OniGalore before building URL
            if not found_iw_match:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong
            if not found_iw_match or iw_url == "":
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                # Count this like every other ERROR so that the summary total is accurate
                errors_issued += 1
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)
337
def print_summary():
    """Print the wrap-up message built from the module-level counters.

    Reports how many pages were checked and intrawiki links found, followed by the number
    of pieces of advice and errors issued, using singular wording when a count is 1.
    """
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    # The advice line has three distinct wordings: none, exactly one, or several
    if advice_issued == 0:
        advice_line = ' No advice on potential problems was issued.'
    elif advice_issued == 1:
        advice_line = ' 1 piece of advice on a potential problem was issued.'
    else:
        advice_line = ' {} pieces of advice on potential problems were issued.'.format(advice_issued)
    pywikibot.stdout(advice_line)

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
367
def main(*args):
    """Parse the command-line arguments, scan the requested page(s), and print a summary.

    Recognized arguments (after pywikibot has handled its own):
        -cat:NAME   scan every page in category NAME, recursing into subcategories
        -page:NAME  scan the single page NAME
        -dbg        turn on debug output

    An unrecognized argument is reported and ends the run without scanning anything.
    """
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # Collect the pages to check: a whole category takes precedence over a single page
    if search_cat:
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages_to_check = pagegenerators.PreloadingGenerator(generator, 100)
    elif search_page:
        pages_to_check = [pywikibot.Page(site, search_page)]
    else:
        pages_to_check = []

    for page in pages_to_check:
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Print the results
    print_summary()


if __name__ == '__main__':
    main()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: ValBot/Python/check_intrawiki_section_links.py@ 1194

Download in other formats: