Context Navigation

source: ValBot/Python/check_intrawiki_section_links.py@ 1186

Last change on this file since 1186 was 1186, checked in by iritscen, 12 months ago
ValBot: Forgot to update one variable name in check_intrawiki_section_links.py.
File size: 18.2 KB

Line
1	# Check Intrawiki Section Links
2	# by iritscen@yahoo.com
3	# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4	# and loads the linked page and verifies that the named section actually exists. It also
5	# understands section links generated through a call to Template:SectionLink.
6	# Recommended viewing width:
7	# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8
9	import os
10
11	from urllib.parse import urljoin
12
13	import pywikibot
14	import re
15
16	from pywikibot.bot import QuitKeyboardInterrupt
17	from pywikibot import pagegenerators
18	from pywikibot.tools.formatter import color_format
19	from pywikibot.comms.http import fetch
20	from pywikibot.specialbots import UploadRobot
21	from bs4 import BeautifulSoup
22
# Tuple of OniGalore's namespaces
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk',
    'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk',
    'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk',
    'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk',
)

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw',
    'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote',
    'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp',
)

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion
chapter_names = [
    'CHAPTER_00_._COMBAT_TRAINING',
    'CHAPTER_01_._TRIAL_RUN',
    'CHAPTER_02_._ENGINES_OF_EVIL',
    'CHAPTER_03_._PUZZLE_PIECES',
    'CHAPTER_04_._TIGER_BY_THE_TAIL',
    'CHAPTER_05_._HOT_PURSUIT',
    'CHAPTER_06_._COUNTERATTACK',
    'CHAPTER_07_._A_FRIEND_IN_NEED',
    'CHAPTER_08_._AN_INNOCENT_LIFE',
    'CHAPTER_09_._TRUTH_AND_CONSEQUENCES',
    'CHAPTER_10_._CAT_AND_MOUSE',
    'CHAPTER_11_._DREAM_DIVER',
    'CHAPTER_12_._SINS_OF_THE_FATHER',
    'CHAPTER_13_._PHOENIX_RISING',
    'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS',
]

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
#            (the match stops at the first "|" or "]", i.e. right after the link target)
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
# Raw strings are used so that the backslashes reach the regex engine untouched.
link_patterns = (
    r"\[\[[^|\]]*(\||\])",
    r"\{\{SectionLink\|[^|}]*\|[^|}]*\}\}",
)

# Initialize globals
debug = 0            # set to 1 by the "-dbg" argument; enables verbose tracing
pages_checked = 0    # number of pages scanned so far
iw_found = 0         # number of intrawiki section links that were turned into URLs
advice_issued = 0    # number of ADVICE-level messages printed
errors_issued = 0    # number of ERROR-level messages printed
name_printed = 0     # 1 once the "From page" header was printed for the current page
47
def possibly_print(page_name):
    """Print a 'From page "<name>":' header for page_name, at most once.

    Nothing is printed when debug mode is on or when the name_printed flag is
    already set. After printing, name_printed is set to 1 so that later calls
    stay silent until the flag is cleared again.
    """
    global debug
    global name_printed

    # Guard clause: header already shown, or debug output makes it redundant
    if name_printed or debug:
        return

    pywikibot.stdout('')
    pywikibot.stdout('From page "{}":'.format(page_name))
    name_printed = 1
57
def find_section(page_text, page_name, page_slug, print_result):
    """Search a page for the section specified in the link.

    page_text    -- HTML source of the linked page
    page_name    -- name of the page the link was found on (for the error header)
    page_slug    -- link target of the form "Page_name#Section_name"
    print_result -- when true and debug is on, also report a successful lookup

    Increments errors_issued and prints an error if no <span> or <div> on the
    page has an "id" equal to the anchor.
    """
    global errors_issued

    # Isolate section link; only the first '#' separates page from anchor
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')
    if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))

    # Convert slash character to the dot-notation hex encoding that MediaWiki uses
    anchor_name = anchor_name.replace('/', '.2F')

    # Read linked page to see if it really has this anchor link. Spans are searched
    # first, then divs, and the search stops at the first tag type with a match.
    soup = BeautifulSoup(page_text, 'html.parser')
    found_section = False
    for tag_name in ('span', 'div'):
        if any(tag.get('id', None) == anchor_name for tag in soup.find_all(tag_name)):
            if debug and not print_result:
                pywikibot.stdout(' Found section in a {}!'.format(tag_name))
            found_section = True
            break

    if not found_section:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
        errors_issued += 1
    elif debug and print_result:
        pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
93
def find_canonical_link(page_text, page_name, page_slug):
    """Report where a redirecting link really leads and check its section, if any.

    For a link that redirected us to another page, the name of the target page
    is extracted from the target page's source, then the section named in
    page_slug (if it has one) is looked up on that page.

    page_text -- HTML source of the page we ended up on
    page_name -- name of the page the link was found on (for the error header)
    page_slug -- the link as written, optionally with a "#Section" suffix

    Increments errors_issued when the target page name cannot be isolated.
    """
    # Without this declaration the increments below would raise UnboundLocalError
    global errors_issued

    # Extract the name of the redirected-to page from this markup in the page source:
    # "wgPageName":"Namespace:Page_name",
    canonical_name = page_text.split('"wgPageName":"')[-1]
    tag_end = canonical_name.find('",')

    if tag_end == -1:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
        errors_issued += 1
        return

    canonical_name = canonical_name[:tag_end]
    if len(canonical_name) > 100:
        # Certain things can cause the trim to fail; report error and avoid slamming the
        # output with massive page source from a failed trim
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
        errors_issued += 1
        return

    canonical_name = canonical_name.replace('_', ' ')
    if '#' in page_slug:
        # Split on the first '#' only, so a '#' inside the section name cannot break unpacking
        _, anchor_name = page_slug.split('#', 1)
        if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
        find_section(page_text, page_name, page_slug, True)
    else:
        pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))
121
def test_intrawiki_link(iw_url, page_name, page_slug):
    """Fetch an intrawiki link and look for its section link if applicable.

    iw_url    -- full URL of the linked page
    page_name -- name of the page the link was found on (for message headers)
    page_slug -- the link target, "Page_name#Section_name"

    Depending on the response, the section is checked directly, checked on the
    page a redirect led to, or an error/advice message is printed and counted.
    """
    global advice_issued
    global errors_issued

    response = fetch(iw_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from
    # pages such as Special:PermanentLink.
    if response.history != []:
        permalink_prefixes = ('Special:PermanentLink/'.lower(), 'Special:Permalink/'.lower())
        if page_slug.lower().startswith(permalink_prefixes):
            if debug:
                possibly_print(page_name)
                pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
            find_canonical_link(response.text, page_name, page_slug)
        else:
            possibly_print(page_name)
            pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
            # NOTE(review): the message says ERROR but it is tallied as advice
            advice_issued += 1
        return

    if response.status_code != 200:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
        errors_issued += 1
        return

    # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
    # using JavaScript, while returning code OK 200 as if the link was correct; this happens
    # when a redirect page is accessed. We must detect these soft redirects by looking at the
    # page source to find the redirect note inserted at the top of the page for the reader.
    if 'Redirected from <a' in response.text:
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
        find_canonical_link(response.text, page_name, page_slug)
        return

    # URL is OK, so proceed
    find_section(response.text, page_name, page_slug, False)
161
def scan_for_intrawiki_links(page_text, page_name):
    """Search the given page text for intrawiki links with section links in them.

    page_text -- wikitext of the page being scanned
    page_name -- title of that page; used for messages and to resolve relative
                 ("/Sub", "../Sibling") and bare ("#Section") links

    Every match of the patterns in link_patterns that contains a '#' and is not
    an interwiki link is turned into a URL under onigalore_url and handed to
    test_intrawiki_link(). Updates the pages_checked, iw_found, advice_issued
    and errors_issued counters and clears name_printed for the new page.
    """
    global debug
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    # Loop-invariant patterns for recognizing "{{C<number>}}" chapter transclusions
    ch_link_pattern = re.compile(r"{{C[0-9]*}}")
    ch_num_pattern = re.compile("[0-9]+")

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            target_start = 2  # "[["
            target_end = 1  # "|" or "]" (we only match the first ending bracket)
            if i == 1:
                target_start = 14  # "{{SectionLink|"
                target_end = 2  # "}}"
            s = match.start() + target_start  # remove the link-opening markup
            e = match.end() - target_end  # remove the link-ending markup
            page_slug = page_text[s:e]

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if '#' not in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
            # for the task of checking interwiki page+section links
            is_interwiki = False
            for prefix in interwiki_prefixes:
                if prefix + ":" in page_slug:
                    if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                    is_interwiki = True
                    break
            if is_interwiki:
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded text.
            # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using
            # our "chapter_names" array. If it's another type of transclusion, punt it to the user.
            if '{' in page_slug:
                ch_link = ch_link_pattern.search(page_slug)
                if ch_link:
                    ch_link_match = ch_link.group(0)
                    # Only report the match once we know there is one; with no match,
                    # ch_link is None and has no group()
                    if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link_match))
                    ch_num = ch_num_pattern.search(ch_link_match)
                    if ch_num:
                        ch_num_match = int(ch_num.group(0))
                        if 0 <= ch_num_match < len(chapter_names):
                            ch_name = chapter_names[ch_num_match]
                            # ch_link_match is exactly "{{C<digits>}}", so a plain replace suffices
                            page_slug = page_slug.replace(ch_link_match, ch_name)
                            if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))
                        else:
                            possibly_print(page_name)
                            pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                            errors_issued += 1
                            continue
                    else:
                        possibly_print(page_name)
                        pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                        advice_issued += 1
                        continue
                else:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

            # If this is a relative "/" link, use the current page as the basis for the URL. Note
            # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
            # we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that page,
            # then remove the relative portion of the link. Note that this is only performed once,
            # so if there's multiple steps back ("../../"), we're out of luck.
            if page_slug.startswith('../'):
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:len(page_slug)]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't add a
                # slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would just be
            # part of the text) then it's a Main namespace article; proceed with building URL
            if not found_iw_match:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if not found_iw_match:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong
            if (not found_iw_match) or (iw_url == ""):
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                # Count it, so the summary agrees with the ERROR lines that were printed
                errors_issued += 1
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)
316
def print_summary():
    """Print a wrap-up message with the page, link, advice and error totals.

    Reads the pages_checked, iw_found, advice_issued and errors_issued counters
    and picks singular or plural wording for each.
    """
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued

    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    # The advice line has three distinct wordings: none, one, many
    if advice_issued == 0:
        advice_line = ' No advice on potential problems was issued.'
    elif advice_issued == 1:
        advice_line = ' 1 piece of advice on a potential problem was issued.'
    else:
        advice_line = ' {} pieces of advice on potential problems were issued.'.format(advice_issued)
    pywikibot.stdout(advice_line)

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
346
def main(*args):
    """Parse the arguments, scan the requested page(s), and print the summary.

    Recognized arguments (after pywikibot's own have been handled):
      -cat:<name>   scan every page in the category, recursing into subcategories
      -page:<name>  scan a single page
      -dbg          turn on debug output
    Any other argument prints a message and returns without scanning.
    """
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # This line of code enumerates the methods in the 'page' class
    #pywikibot.stdout(format(dir(page)))

    # Collect the pages to check: a whole category takes precedence over a single page,
    # and with neither given there is nothing to scan
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages_to_check = pagegenerators.PreloadingGenerator(generator, 100)
    elif search_page != '':
        pages_to_check = [pywikibot.Page(site, search_page)]
    else:
        pages_to_check = []

    for page in pages_to_check:
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Print the results
    print_summary()
385
# Run the checker only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: