Context Navigation

source: ValBot/Python/check_intrawiki_section_links.py@ 1180

Last change on this file since 1180 was 1179, checked in by iritscen, 19 months ago
ValBot: check_intrawiki_section_links.py: Simplified output to just advice and errors. Added support for SectionLink template. Added support for links built on chapter name transclusion. Placed verbose output under a "-dbg" argument.
File size: 14.7 KB

Rev	Line
[1171]	1	# Check Intrawiki Section Links
	2	# by iritscen@yahoo.com
	3	# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
[1179]	4	# and loads the linked page and verifies that the named section actually exists. It also
	5	# understands section links generated through a call to Template:SectionLink.
[1171]	6	# Recommended viewing width:
	7	# \|---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --\|
	8
[1169]	9	import os
	10
	11	from urllib.parse import urljoin
	12
	13	import pywikibot
	14	import re
	15
	16	from pywikibot.bot import QuitKeyboardInterrupt
	17	from pywikibot import pagegenerators
	18	from pywikibot.tools.formatter import color_format
	19	from pywikibot.comms.http import fetch
	20	from pywikibot.specialbots import UploadRobot
	21	from bs4 import BeautifulSoup
	22
# Tuple of OniGalore's namespaces
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk',
    'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk',
    'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD',
    'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk',
)

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for passing over such links
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw',
    'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote',
    'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp',
)

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion.
# The list index is the chapter number, so "{{C3}}" maps to chapter_names[3].
chapter_names = [
    'CHAPTER_00_._COMBAT_TRAINING',
    'CHAPTER_01_._TRIAL_RUN',
    'CHAPTER_02_._ENGINES_OF_EVIL',
    'CHAPTER_03_._PUZZLE_PIECES',
    'CHAPTER_04_._TIGER_BY_THE_TAIL',
    'CHAPTER_05_._HOT_PURSUIT',
    'CHAPTER_06_._COUNTERATTACK',
    'CHAPTER_07_._A_FRIEND_IN_NEED',
    'CHAPTER_08_._AN_INNOCENT_LIFE',
    'CHAPTER_09_._TRUTH_AND_CONSEQUENCES',
    'CHAPTER_10_._CAT_AND_MOUSE',
    'CHAPTER_11_._DREAM_DIVER',
    'CHAPTER_12_._SINS_OF_THE_FATHER',
    'CHAPTER_13_._PHOENIX_RISING',
    'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS',
]

# Tuple of patterns for recognizing wikilinks. Raw strings are used so that the regex
# escapes are not interpreted (and warned about) as Python string escapes.
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]".
#            The match stops at the first "|" or "]" after the link target.
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
link_patterns = (
    r"\[\[[^|\]]*(\||\])",
    r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}",
)

# Initialize globals
debug = 0           # set to 1 by the "-dbg" argument to enable verbose output
pages_checked = 0   # number of pages scanned
iw_found = 0        # number of intrawiki section links that were turned into URLs
advice_issued = 0   # number of "ADVICE" messages printed
errors_issued = 0   # number of "ERROR" messages printed
[1169]	46
	47	# Searches the given page text for intrawiki links with section links in them
[1179]	48	def scan_for_intrawiki_links(page_text, page_name):
	49	global debug
	50	global pages_checked
	51	global iw_found
	52	global advice_issued
	53	global errors_issued
	54	pages_checked += 1
	55	name_printed = 0
[1169]	56
[1179]	57	for i, the_pattern in enumerate(link_patterns):
	58	if debug:
	59	if i == 0:
	60	pywikibot.stdout(' Checking page for wikilinks with section names.')
	61	elif i == 1:
	62	pywikibot.stdout(' Checking page for {{SectionLink}} calls.')
	63
	64	for match in re.finditer(the_pattern, page_text):
	65	found_iw_match = False
	66	iw_url = ""
	67	page_name2 = page_name
	68
	69	# Cut out the matched text from the page, isolating just the page+section name
	70	target_start = 2 # "[["
	71	target_end = 1 # "\|" or "]" (we only match the first ending bracket)
	72	if i == 1:
	73	target_start = 14 # "{{SectionLink\|"
	74	target_end = 2 # "}}"
	75	s = match.start() + target_start # remove the link-opening markup
	76	e = match.end() - target_end # remove the link-ending markup
	77	link_text = page_text[s:e]
	78
	79	# The second link type will look like "Page\|Section" or "\|Section", so fix that pipe
	80	if i == 1:
	81	link_text = link_text.replace('\|', '#')
[1169]	82
[1179]	83	# Sometimes we use a space char. instead of a '_', so fix that before querying
	84	link_text = link_text.replace(' ', '_')
	85	if debug: pywikibot.stdout(' Found link {0}.'.format(link_text))
	86
	87	# If this link doesn't have a section link in it, then we don't care about it, as
	88	# MediaWiki takes care of checking basic intrawiki links
	89	if not '#' in link_text:
	90	if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
[1169]	91	continue
[1176]	92
[1179]	93	# If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
	94	# for the task of checking interwiki page+section links
	95	is_interwiki = False
	96	if found_iw_match == False:
[1176]	97	for prefix in interwiki_prefixes:
[1179]	98	if prefix + ":" in link_text:
	99	if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(link_text))
	100	is_interwiki = True
	101	break
	102	if is_interwiki:
[1176]	103	continue
[1179]	104
	105	# If there is a '{' in the link, then probably it's a link built on transcluded text
	106	# like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
	107	if '{' in link_text:
	108	ch_link_pattern = re.compile(r"{{C[0-9]*}}")
	109	ch_link = ch_link_pattern.search(link_text)
	110	if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))
	111	if ch_link:
	112	ch_link_match = ch_link.group(0)
	113	ch_num_pattern = re.compile("[0-9]+")
	114	ch_num = ch_num_pattern.search(ch_link_match)
	115	if ch_num:
	116	ch_num_match = int(ch_num.group(0))
	117	if ch_num_match >= 0 and ch_num_match <= 14:
	118	ch_name = chapter_names[ch_num_match]
	119	replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}")
	120	link_text = replace_pattern.sub(ch_name, link_text)
	121	if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(link_text))
	122	else:
	123	if not name_printed and not debug:
	124	pywikibot.stdout('From page "{}":'.format(page_name))
	125	name_printed = 1
	126	pywikibot.stdout(' ADVICE: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(link_text, ch_num_match))
	127	advice_issued += 1
	128	continue
	129	else:
	130	if not name_printed and not debug:
	131	pywikibot.stdout('From page "{}":'.format(page_name))
	132	name_printed = 1
	133	pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(link_text))
	134	advice_issued += 1
	135	continue
	136	else:
	137	if not name_printed and not debug:
	138	pywikibot.stdout('From page "{}":'.format(page_name))
	139	name_printed = 1
	140	pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(link_text, "{{C7}}"))
	141	advice_issued += 1
	142	continue
[1171]	143
[1179]	144	# If this is a relative "/" link, use the current page as the basis for the URL. Note
	145	# that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
	146	# we're out of luck.
	147	if link_text.startswith('/'):
[1171]	148	link_text = page_name + link_text
[1179]	149	if debug: pywikibot.stdout(' Changed link_text to {} on account of "/".'.format(link_text))
	150
	151	# If this is a relative "../" link, find the parent page, set ourselves to that page,
	152	# then remove the relative portion of the link. Note that this is only performed once,
	153	# so if there's multiple steps back ("../../"), we're out of luck.
	154	if link_text.startswith('../'):
[1169]	155	last_slash = page_name.rfind('/')
	156	page_name2 = page_name[0:last_slash]
[1179]	157	if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
[1169]	158	link_text = link_text[3:len(link_text)]
[1179]	159	if debug: pywikibot.stdout(' Changed link_text to {} on account of "../".'.format(link_text))
[1171]	160	# If this is now going to be a bare section link for the parent page, don't add a
	161	# slash, otherwise do because we are drilling down to another subpage
[1169]	162	if link_text.startswith('#'):
[1179]	163	link_text = page_name2 + link_text
[1169]	164	else:
[1179]	165	link_text = page_name2 + '/' + link_text
	166
	167	# If this is a bare section link, build URL based on this page
	168	if link_text.startswith('#'):
[1169]	169	iw_url = onigalore_url + page_name2
[1179]	170	iw_found += 1
	171	if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(link_text))
[1169]	172	found_iw_match = True
	173	link_text = page_name2 + link_text
[1179]	174
	175	# If there's no ":" in the link (before the section link, where a colon would just be
	176	# part of the text) then it's a Main namespace article; proceed with building URL
	177	if found_iw_match == False:
[1169]	178	if not re.search(":.*#", link_text):
[1179]	179	iw_url = onigalore_url + link_text
	180	iw_found += 1
	181	if debug: pywikibot.stdout(' Link is to a Main namespace page.')
	182	found_iw_match = True
	183
	184	# If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
	185	# before building URL
	186	if found_iw_match == False:
[1169]	187	for prefix in intrawiki_prefixes:
[1179]	188	if prefix + ":" in link_text:
	189	iw_url = onigalore_url + link_text
	190	if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
	191	iw_found += 1
	192	found_iw_match = True
	193	break
	194
	195	# If we still haven't turned this match into a URL, something's gone wrong
	196	if (found_iw_match == False) or (iw_url == ""):
	197	if not name_printed and not debug:
	198	pywikibot.stdout('From page "{}":'.format(page_name))
	199	name_printed = 1
	200	pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(link_text))
[1173]	201	continue
[1169]	202
[1179]	203	# Test the URL
	204	iw_url = iw_url.replace(' ', '_')
	205	if debug: pywikibot.stdout(' Reading page at {}...'.format(iw_url))
	206	response = fetch(iw_url)
[1169]	207
[1179]	208	# Redirects are followed automatically by fetch() and treated as "200"s; the way we can
	209	# tell that a redirect occurred is by checking fetch's history
	210	if response.history != []:
	211	if not name_printed and not debug:
	212	pywikibot.stdout('From page "{}":'.format(page_name))
	213	name_printed = 1
	214	pywikibot.stdout(' ADVICE: Got redirection code ({0}) on URL "{1}". You should check the link manually.'.format(response.history[0], iw_url))
	215	advice_issued += 1
	216	elif response.status_code != 200:
	217	if not name_printed and not debug:
	218	pywikibot.stdout('From page "{}":'.format(page_name))
	219	name_printed = 1
	220	pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
	221	errors_issued += 1
	222	else:
[1169]	223	# Isolate section link
	224	pre_section, section_name = link_text.split('#', 1)
[1179]	225	if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(section_name))
	226
[1169]	227	# Convert slash character to the dot-notation hex encoding that MediaWiki uses
	228	section_name = section_name.replace('/', '.2F')
[1179]	229
[1169]	230	# Read linked page to see if it really has this anchor link
	231	soup = BeautifulSoup(response.text, 'html.parser')
	232	found_section = False
	233	for span_tag in soup.findAll('span'):
[1179]	234	span_name = span_tag.get('id', None)
	235	if span_name == section_name:
	236	if debug: pywikibot.stdout(' Found section!')
	237	found_section = True
	238	break
[1169]	239	if found_section == False:
[1179]	240	if not name_printed and not debug:
	241	pywikibot.stdout('From page "{}":'.format(page_name))
	242	name_printed = 1
	243	pywikibot.stdout(' ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
	244	errors_issued += 1
[1169]	245
	246	def main(*args):
[1179]	247	global debug
	248	global pages_checked
	249	global iw_found
	250	global advice_issued
	251	global errors_issued
	252	search_cat = ''
	253	search_page = ''
[1169]	254
[1179]	255	local_args = pywikibot.handle_args(args)
	256	genFactory = pagegenerators.GeneratorFactory()
[1169]	257
[1179]	258	for arg in local_args:
	259	if arg.startswith('-cat:'):
	260	search_cat = arg[5:]
	261	elif arg.startswith('-page:'):
	262	search_page = arg[6:]
	263	elif arg == '-dbg':
	264	debug = 1
	265	else:
	266	pywikibot.stdout('Unknown argument "{}".'.format(arg))
	267	return
[1169]	268
[1179]	269	site = pywikibot.Site()
[1169]	270
[1179]	271	# This line of code enumerates the methods in the 'page' class
	272	#pywikibot.stdout(format(dir(page)))
[1169]	273
[1179]	274	if search_cat != '':
	275	cat_obj = pywikibot.Category(site, search_cat)
	276	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
	277	for page in pagegenerators.PreloadingGenerator(generator, 100):
	278	if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
	279	scan_for_intrawiki_links(page.text, page.title())
	280	elif search_page != '':
	281	page = pywikibot.Page(site, search_page)
	282	if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
	283	scan_for_intrawiki_links(page.text, page.title())
[1169]	284
[1179]	285	page_str = "pages"
	286	if pages_checked == 1:
	287	page_str = "page"
[1169]	288
[1179]	289	link_str = "links"
	290	if iw_found == 1:
	291	link_str = "link"
[1171]	292
[1179]	293	pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
	294	pywikibot.stdout('While attempting to follow section links...')
[1171]	295
[1179]	296	if advice_issued == 0:
	297	pywikibot.stdout(' No advice on potential problems was issued.')
	298	elif advice_issued == 1:
	299	pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
	300	else:
	301	pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))
[1171]	302
[1179]	303	error_str = "errors were"
	304	if errors_issued == 1:
	305	error_str = "error was"
	306	pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
[1171]	307
[1169]	308	if __name__ == '__main__':
[1179]	309	main()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: