Context Navigation

source: ValBot/Python/check_intrawiki_section_links.py@ 1192

Last change on this file since 1192 was 1192, checked in by iritscen, 2 months ago
ValBot: Corrected syntax of two strings with regex patterns.
File size: 18.0 KB

Line
# Check Intrawiki Section Links
# by iritscen@yahoo.com
# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
# and loads the linked page and verifies that the named section actually exists. It also
# understands section links generated through a call to Template:SectionLink.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8
9	import os
10
11	from urllib.parse import urljoin
12
13	import pywikibot
14	import re
15
16	from pywikibot.bot import QuitKeyboardInterrupt
17	from pywikibot import pagegenerators
18	from pywikibot.tools.formatter import color_format
19	from pywikibot.comms.http import fetch
20	from pywikibot.specialbots import UploadRobot
21	from bs4 import BeautifulSoup
22
# Tuple of OniGalore's namespaces
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk',
    'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk',
    'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk',
    'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk',
)

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw',
    'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote',
    'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp',
)

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion
chapter_names = [
    'CHAPTER_00_._COMBAT_TRAINING',
    'CHAPTER_01_._TRIAL_RUN',
    'CHAPTER_02_._ENGINES_OF_EVIL',
    'CHAPTER_03_._PUZZLE_PIECES',
    'CHAPTER_04_._TIGER_BY_THE_TAIL',
    'CHAPTER_05_._HOT_PURSUIT',
    'CHAPTER_06_._COUNTERATTACK',
    'CHAPTER_07_._A_FRIEND_IN_NEED',
    'CHAPTER_08_._AN_INNOCENT_LIFE',
    'CHAPTER_09_._TRUTH_AND_CONSEQUENCES',
    'CHAPTER_10_._CAT_AND_MOUSE',
    'CHAPTER_11_._DREAM_DIVER',
    'CHAPTER_12_._SINS_OF_THE_FATHER',
    'CHAPTER_13_._PHOENIX_RISING',
    'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS',
]

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
#            The match stops at the first "|" or "]", so exactly one trailing character
#            has to be trimmed off to isolate the link target.
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
link_patterns = (
    r"\[\[[^|\]]*(\||\])",
    r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}",
)

# Initialize globals
debug = 0            # nonzero enables verbose tracing (set by the "-dbg" argument)
pages_checked = 0    # number of pages scanned so far
iw_found = 0         # number of intrawiki section links that were turned into URLs
advice_issued = 0    # number of "ADVICE" messages printed
errors_issued = 0    # number of "ERROR" messages printed
name_printed = 0     # nonzero once the current page's name has been printed
47
# Announces the page on which something noteworthy occurred, but only the first time it is
# asked to for that page, and never in debug mode
def possibly_print(page_name):
    global name_printed

    # Nothing to do if the header is already out, or if debug tracing is on
    if name_printed or debug:
        return

    header = 'From page "{}":'.format(page_name)
    pywikibot.stdout('')
    pywikibot.stdout(header)
    name_printed = 1
57
# Search a page for the section specified in the link
#   page_text:    HTML source of the linked page
#   page_name:    name of the page the link was found on (used for the error header)
#   page_slug:    link target in the form "Page_name#Anchor"; it must contain a '#'
#   print_result: selects which debug message reports a successful find
# Increments the global error count when the anchor is not found.
def find_section(page_text, page_name, page_slug, print_result):
    global errors_issued

    # Isolate section link
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')
    if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))

    # Read linked page to see if it really has this anchor link. The anchor is looked for as
    # the "id" of a span first, and then as the "id" of a div.
    soup = BeautifulSoup(page_text, 'html.parser')
    found_section = False
    for tag_name in ('span', 'div'):
        if any(tag.get('id', None) == anchor_name for tag in soup.find_all(tag_name)):
            if debug and not print_result:
                pywikibot.stdout(' Found section in a {}!'.format(tag_name))
            found_section = True
            break

    if not found_section:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
        errors_issued += 1
    elif debug and print_result:
        pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
90
# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
#   page_text: HTML source of the page we ended up on
#   page_name: name of the page the link was found on (used for the error header)
#   page_slug: the link as written, possibly with a "#Anchor" suffix
# Increments the global error count when the target page name cannot be isolated.
def find_canonical_link(page_text, page_name, page_slug):
    # This function assigns to the counter, so it has to be declared global here; without
    # the declaration the increments below raise UnboundLocalError
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # "wgPageName":"Namespace:Page_name",
    canonical_name = page_text.split('"wgPageName":"')[-1]
    tag_end = canonical_name.find('",')

    if tag_end == -1:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
        errors_issued += 1
        return

    canonical_name = canonical_name[:tag_end]
    if len(canonical_name) > 100:
        # Certain things can cause the trim to fail; report error and avoid slamming the
        # output with massive page source from a failed trim
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
        errors_issued += 1
        return

    canonical_name = canonical_name.replace('_', ' ')
    if '#' in page_slug:
        # Split on the first '#' only, so an anchor that itself contains '#' still unpacks
        _, anchor_name = page_slug.split('#', 1)
        if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
        find_section(page_text, page_name, page_slug, True)
    else:
        pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))
118
# Fetch the page behind an intrawiki link and verify its section anchor, following
# redirects where we know how to
#   iw_url:    full URL to request
#   page_name: name of the page the link was found on (used for message headers)
#   page_slug: the link as "Page_name#Anchor"
def test_intrawiki_link(iw_url, page_name, page_slug):
    global advice_issued
    global errors_issued

    response = fetch(iw_url)

    # Case 1: fetch followed a formal redirect on our behalf, which shows up in its history.
    # This is what happens for pages such as Special:PermanentLink.
    if response.history != []:
        permalink_prefixes = ('special:permanentlink/', 'special:permalink/')
        if page_slug.lower().startswith(permalink_prefixes):
            if debug:
                possibly_print(page_name)
                pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
            find_canonical_link(response.text, page_name, page_slug)
        else:
            possibly_print(page_name)
            pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
            advice_issued += 1
        return

    # Case 2: the request itself did not succeed
    if response.status_code != 200:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
        errors_issued += 1
        return

    # Case 3: MediaWiki answered 200 for a redirect page and sent us on via JavaScript. Such
    # a soft redirect is recognized by the note that gets inserted at the top of the page.
    if 'Redirected from <a' in response.text:
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
        find_canonical_link(response.text, page_name, page_slug)
        return

    # Case 4: URL is OK, so look for the section on the page we received
    find_section(response.text, page_name, page_slug, False)
158
# Searches the given page text for intrawiki links with section links in them
#   page_text: wikitext of the page being checked
#   page_name: title of the page being checked
# Each link with a section anchor that can be turned into a URL is handed to
# test_intrawiki_link(). Updates the global page, link, advice and error counters.
def scan_for_intrawiki_links(page_text, page_name):
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    # Loop-invariant patterns, compiled once per call
    ch_link_pattern = re.compile(r"\{\{C[0-9]*\}\}")
    ch_num_pattern = re.compile("[0-9]+")
    namespace_pattern = re.compile(":.*#")

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        # Amount of markup to cut from each end of a match to isolate page+section name
        if i == 1:
            target_start = 14 # "{{SectionLink|"
            target_end = 2 # "}}"
        else:
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            page_slug = page_text[match.start() + target_start:match.end() - target_end]

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if '#' not in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
            # for the task of checking interwiki page+section links
            if any(prefix + ":" in page_slug for prefix in interwiki_prefixes):
                if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded text.
            # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using
            # our "chapter_names" array. If it's another type of transclusion, punt it to the user.
            if '{' in page_slug:
                ch_link = ch_link_pattern.search(page_slug)
                if ch_link is None:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

                # Only report the transclusion once we know the search actually matched
                ch_link_match = ch_link.group(0)
                if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link_match))

                ch_num = ch_num_pattern.search(ch_link_match)
                if ch_num is None:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                    advice_issued += 1
                    continue

                ch_num_match = int(ch_num.group(0))
                if not 0 <= ch_num_match < len(chapter_names):
                    possibly_print(page_name)
                    pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                    errors_issued += 1
                    continue

                # Substitute the chapter name for every occurrence of this "{{Cn}}" call
                page_slug = page_slug.replace(ch_link_match, chapter_names[ch_num_match])
                if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))

            # If this is a relative "/" link, use the current page as the basis for the URL. Note
            # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
            # we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that page,
            # then remove the relative portion of the link. Note that this is only performed once,
            # so if there's multiple steps back ("../../"), we're out of luck.
            if page_slug.startswith('../'):
                # NOTE(review): when page_name contains no '/', rfind() returns -1 and the slice
                # below drops the last character of the name instead of finding a parent page
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't add a
                # slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would just be
            # part of the text) then it's a Main namespace article; proceed with building URL
            if not found_iw_match and not namespace_pattern.search(page_slug):
                iw_url = onigalore_url + page_slug
                iw_found += 1
                if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if not found_iw_match:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong. This is
            # reported as an ERROR, so it is counted as one for the summary.
            if not found_iw_match or iw_url == "":
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                errors_issued += 1
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)
313
# Report the totals gathered over the run: pages checked, links found, advice and errors
def print_summary():
    # Pick singular or plural nouns to agree with the counts
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    # The advice line has three distinct wordings: none, exactly one, several
    if advice_issued == 0:
        advice_line = ' No advice on potential problems was issued.'
    elif advice_issued == 1:
        advice_line = ' 1 piece of advice on a potential problem was issued.'
    else:
        advice_line = ' {} pieces of advice on potential problems were issued.'.format(advice_issued)
    pywikibot.stdout(advice_line)

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
343
# Entry point: reads the arguments, checks the requested page or category, prints a summary
#   -cat:<name>   check every page in the named category (recursing into subcategories)
#   -page:<name>  check the single named page
#   -dbg          turn on debug output
def main(*args):
    global debug
    cat_flag = '-cat:'
    page_flag = '-page:'
    search_cat = ''
    search_page = ''

    # Process arguments; anything pywikibot doesn't consume itself must be one of ours
    for arg in pywikibot.handle_args(args):
        if arg.startswith(cat_flag):
            search_cat = arg[len(cat_flag):]
        elif arg.startswith(page_flag):
            search_page = arg[len(page_flag):]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # Gather the pages to check: a whole category takes precedence over a single page, and
    # with neither given there is nothing to check
    if search_cat:
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    elif search_page:
        pages = [pywikibot.Page(site, search_page)]
    else:
        pages = []

    for page in pages:
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: