Context Navigation

source: ValBot/Python/check_intrawiki_section_links.py@ 1179

Last change on this file since 1179 was 1179, checked in by iritscen, 3 years ago
ValBot: check_intrawiki_section_links.py: Simplified output to just advice and errors. Added support for SectionLink template. Added support for links built on chapter name transclusion. Placed verbose output under a "-dbg" argument.
File size: 14.7 KB

Line
1	# Check Intrawiki Section Links
2	# by iritscen@yahoo.com
3	# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4	# and loads the linked page and verifies that the named section actually exists. It also
5	# understands section links generated through a call to Template:SectionLink.
6	# Recommended viewing width:
7	# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8
9	import os
10
11	from urllib.parse import urljoin
12
13	import pywikibot
14	import re
15
16	from pywikibot.bot import QuitKeyboardInterrupt
17	from pywikibot import pagegenerators
18	from pywikibot.tools.formatter import color_format
19	from pywikibot.comms.http import fetch
20	from pywikibot.specialbots import UploadRobot
21	from bs4 import BeautifulSoup
22
# Tuple of OniGalore's namespaces
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File',
    'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help',
    'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE',
    'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk',
)

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for passing over such links
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks',
    'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource',
    'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp',
)

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion; the
# list index is the chapter number n
chapter_names = [
    'CHAPTER_00_._COMBAT_TRAINING',
    'CHAPTER_01_._TRIAL_RUN',
    'CHAPTER_02_._ENGINES_OF_EVIL',
    'CHAPTER_03_._PUZZLE_PIECES',
    'CHAPTER_04_._TIGER_BY_THE_TAIL',
    'CHAPTER_05_._HOT_PURSUIT',
    'CHAPTER_06_._COUNTERATTACK',
    'CHAPTER_07_._A_FRIEND_IN_NEED',
    'CHAPTER_08_._AN_INNOCENT_LIFE',
    'CHAPTER_09_._TRUTH_AND_CONSEQUENCES',
    'CHAPTER_10_._CAT_AND_MOUSE',
    'CHAPTER_11_._DREAM_DIVER',
    'CHAPTER_12_._SINS_OF_THE_FATHER',
    'CHAPTER_13_._PHOENIX_RISING',
    'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS',
]

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]". The
#            match stops at the first "|" or "]" so that it covers only the link target.
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
# Raw strings are used so that the regex escapes reach the "re" module untouched.
link_patterns = (
    r"\[\[[^|\]]*(\||\])",
    r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}",
)

# Initialize globals
debug = 0            # non-zero enables verbose output ("-dbg" argument)
pages_checked = 0    # number of pages scanned
iw_found = 0         # number of intrawiki section links that were turned into URLs
advice_issued = 0    # number of "ADVICE" messages printed
errors_issued = 0    # number of "ERROR" messages printed
46
47	# Searches the given page text for intrawiki links with section links in them
def scan_for_intrawiki_links(page_text, page_name):
    """Verify every intrawiki section link found in a page's wikitext.

    Each pattern in the module-level link_patterns is run over page_text. Links without a
    '#' and links carrying an interwiki prefix are skipped. For the remaining links, "{{Cn}}"
    chapter-name transclusions are expanded, leading "/" and "../" are resolved against
    page_name, and a URL under onigalore_url is built and fetched. The returned HTML is then
    searched for a <span> whose id equals the section name.

    Findings are written with pywikibot.stdout() and tallied in the module-level counters
    pages_checked, iw_found, advice_issued and errors_issued. Nothing is returned.

    page_text -- wikitext of the page to scan
    page_name -- title of that page; used in the output and to resolve relative links and
                 bare "#Section" links
    """
    global pages_checked
    global iw_found
    pages_checked += 1
    name_printed = False

    def report(message):
        # Print a finding; outside of debug mode the first finding on a page is preceded by
        # a header naming the page
        nonlocal name_printed
        if not name_printed and not debug:
            pywikibot.stdout('From page "{}":'.format(page_name))
            name_printed = True
        pywikibot.stdout(message)

    def advise(message):
        # Print an ADVICE message and count it
        global advice_issued
        report(message)
        advice_issued += 1

    def error(message):
        # Print an ERROR message and count it
        global errors_issued
        report(message)
        errors_issued += 1

    # Compiled once here rather than once per link
    chapter_pattern = re.compile(r"{{C[0-9]*}}")
    number_pattern = re.compile("[0-9]+")

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        # Length of the markup to trim from each end of a match in order to isolate just the
        # page+section name
        if i == 1:
            target_start = 14 # "{{SectionLink|"
            target_end = 2 # "}}"
        else:
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            link_text = page_text[match.start() + target_start:match.end() - target_end]

            # The second link type will look like "Page|Section" or "|Section", so turn that
            # pipe into the '#' of an ordinary section link
            if i == 1:
                link_text = link_text.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            link_text = link_text.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(link_text))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if '#' not in link_text:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see
            # check_interwiki_links.py for the task of checking interwiki page+section links
            if any(prefix + ":" in link_text for prefix in interwiki_prefixes):
                if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(link_text))
                continue

            # If there is a '{' in the link, then it's probably built on transcluded text. A
            # chapter name transclusion like "Quotes/Diary#{{C3}}" can be expanded from
            # chapter_names; anything else can't be verified, so advice is issued instead.
            if '{' in link_text:
                ch_link = chapter_pattern.search(link_text)
                if not ch_link:
                    advise(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(link_text, "{{C7}}"))
                    continue

                # Only reached with a real match, so this debug line can no longer fail on
                # an unrecognized transclusion
                ch_link_match = ch_link.group(0)
                if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link_match))

                ch_num = number_pattern.search(ch_link_match)
                if not ch_num:
                    advise(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(link_text))
                    continue

                ch_num_match = int(ch_num.group(0))
                if not 0 <= ch_num_match < len(chapter_names):
                    advise(' ADVICE: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(link_text, ch_num_match))
                    continue

                link_text = link_text.replace(ch_link_match, chapter_names[ch_num_match])
                if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(link_text))

            # If this is a relative "/" link, use the current page as the basis for the URL.
            # Note that only a leading slash is looked for, so if there's multiple steps down
            # ("/x/y"), we're out of luck.
            if link_text.startswith('/'):
                link_text = page_name + link_text
                if debug: pywikibot.stdout(' Changed link_text to {} on account of "/".'.format(link_text))

            # If this is a relative "../" link, find the parent page, set ourselves to that
            # page, then remove the relative portion of the link. Note that this is only
            # performed once, so if there's multiple steps back ("../../"), we're out of luck.
            if link_text.startswith('../'):
                # NOTE(review): when page_name contains no '/', rfind() returns -1 and the
                # slice below drops the last character of page_name instead of finding a
                # parent -- confirm how such links should be reported
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                link_text = link_text[3:]
                if debug: pywikibot.stdout(' Changed link_text to {} on account of "../".'.format(link_text))
                # If this is now going to be a bare section link for the parent page, don't
                # add a slash, otherwise do because we are drilling down to another subpage
                if link_text.startswith('#'):
                    link_text = page_name2 + link_text
                else:
                    link_text = page_name2 + '/' + link_text

            # If this is a bare section link, build URL based on this page
            if link_text.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(link_text))
                found_iw_match = True
                link_text = page_name2 + link_text

            # If there's no ":" in the link (before the section link, where a colon would
            # just be part of the text) then it's a Main namespace article; proceed with
            # building URL
            if not found_iw_match and not re.search(":.*#", link_text):
                iw_url = onigalore_url + link_text
                iw_found += 1
                if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if not found_iw_match:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in link_text:
                        iw_url = onigalore_url + link_text
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong. This
            # is counted like every other ERROR so that the final summary agrees with the
            # messages that were printed.
            if not found_iw_match or iw_url == "":
                error(' ERROR: Couldn\'t figure out link {}.'.format(link_text))
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}...'.format(iw_url))
            response = fetch(iw_url)

            # Redirects are followed automatically by fetch() and treated as "200"s; the way
            # we can tell that a redirect occurred is by checking fetch's history
            if response.history != []:
                advise(' ADVICE: Got redirection code ({0}) on URL "{1}". You should check the link manually.'.format(response.history[0], iw_url))
                continue
            if response.status_code != 200:
                error(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
                continue

            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)
            if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(section_name))

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor link
            soup = BeautifulSoup(response.text, 'html.parser')
            found_section = any(span_tag.get('id', None) == section_name for span_tag in soup.find_all('span'))
            if found_section:
                if debug: pywikibot.stdout(' Found section!')
            else:
                error(' ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
245
def main(*args):
    """Scan the page or category named on the command line and print a summary.

    Recognized arguments (after pywikibot's own have been consumed by handle_args):
      -cat:NAME   scan every page in category NAME, recursing into subcategories
      -page:NAME  scan the single page NAME
      -dbg        turn on verbose output
    Any other argument is reported and ends the run. If both -cat and -page are supplied,
    only the category is scanned.
    """
    global debug

    category_name = ''
    single_page_name = ''

    local_args = pywikibot.handle_args(args)
    # Constructed as in the original script; the factory is not used further here
    genFactory = pagegenerators.GeneratorFactory()

    for arg in local_args:
        if arg.startswith('-cat:'):
            category_name = arg[len('-cat:'):]
        elif arg.startswith('-page:'):
            single_page_name = arg[len('-page:'):]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # Work out which pages to visit; a category takes precedence over a single page
    if category_name:
        members = pagegenerators.CategorizedPageGenerator(
            pywikibot.Category(site, category_name), recurse=True)
        pages = pagegenerators.PreloadingGenerator(members, 100)
    elif single_page_name:
        pages = [pywikibot.Page(site, single_page_name)]
    else:
        pages = []

    for page in pages:
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Summary, with nouns pluralized to agree with the counts
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"
    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links...')

    if advice_issued == 0:
        advice_line = ' No advice on potential problems was issued.'
    elif advice_issued == 1:
        advice_line = ' 1 piece of advice on a potential problem was issued.'
    else:
        advice_line = ' {} pieces of advice on potential problems were issued.'.format(advice_issued)
    pywikibot.stdout(advice_line)

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))


if __name__ == '__main__':
    main()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: