Context Navigation

check_intrawiki_section_links.py@ 1194

Last change on this file since 1194 was 1194, checked in by iritscen, 4 months ago
ValBot: check_intrawiki_section_links.py now understands text fragment directives.
File size: 19.4 KB

Rev	Line
[1171]	1	# Check Intrawiki Section Links
	2	# by iritscen@yahoo.com
	3	# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
[1179]	4	# and loads the linked page and verifies that the named section actually exists. It also
	5	# understands section links generated through a call to Template:SectionLink.
[1171]	6	# Recommended viewing width:
	7	# \|---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --\|
	8
[1169]	9	import os
	10
	11	from urllib.parse import urljoin
	12
	13	import pywikibot
	14	import re
	15
	16	from pywikibot.bot import QuitKeyboardInterrupt
	17	from pywikibot import pagegenerators
	18	from pywikibot.tools.formatter import color_format
	19	from pywikibot.comms.http import fetch
	20	from pywikibot.specialbots import UploadRobot
	21	from bs4 import BeautifulSoup
	22
# Tuple of OniGalore's namespaces
intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion
chapter_names = ['CHAPTER_00_._COMBAT_TRAINING', 'CHAPTER_01_._TRIAL_RUN', 'CHAPTER_02_._ENGINES_OF_EVIL', 'CHAPTER_03_._PUZZLE_PIECES', 'CHAPTER_04_._TIGER_BY_THE_TAIL', 'CHAPTER_05_._HOT_PURSUIT', 'CHAPTER_06_._COUNTERATTACK', 'CHAPTER_07_._A_FRIEND_IN_NEED', 'CHAPTER_08_._AN_INNOCENT_LIFE', 'CHAPTER_09_._TRUTH_AND_CONSEQUENCES', 'CHAPTER_10_._CAT_AND_MOUSE', 'CHAPTER_11_._DREAM_DIVER', 'CHAPTER_12_._SINS_OF_THE_FATHER', 'CHAPTER_13_._PHOENIX_RISING', 'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS']

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
#            (the match stops at the first "|" or "]", so it ends with exactly one closing char)
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
# Note that scan_for_intrawiki_links() strips fixed-width markup from these matches (2 chars and
# 1 char for pattern 1; 14 chars and 2 chars for pattern 2), so the literal parts of the patterns
# must keep those widths.
link_patterns = (r"\[\[[^|\]]*(\||\])", r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}")

# Initialize globals
debug = 0           # set to 1 by the "-dbg" argument
pages_checked = 0   # number of pages scanned
iw_found = 0        # number of intrawiki section links turned into URLs
advice_issued = 0   # number of "ADVICE"-type messages printed
errors_issued = 0   # number of "ERROR"-type messages printed
name_printed = 0    # whether the "From page" header was printed for the current page
[1169]	47
[1185]	48	# Prints the name of a page on which something occurred, if it has not been printed before
	49	def possibly_print(page_name):
	50	global debug
	51	global name_printed
	52
	53	if not name_printed and not debug:
	54	pywikibot.stdout('')
	55	pywikibot.stdout('From page "{}":'.format(page_name))
	56	name_printed = 1
	57
	58	# Search a page for the section specified in the link
	59	def find_section(page_text, page_name, page_slug, print_result):
	60	global errors_issued
[1194]	61	found_section = False
	62
	63	# Isolate section link or text fragment link
[1185]	64	target_page_name, anchor_name = page_slug.split('#', 1)
	65	target_page_name_human = target_page_name.replace('_', ' ')
[1194]	66
	67	# First check if this is a text fragment directive, and look for it if so
	68	if anchor_name.startswith(':~:text='):
	69	if debug: pywikibot.stdout(' Found text fragment directive {} from URL {}.'.format(anchor_name, page_slug))
	70	anchor_name = anchor_name[8:]
	71	# We're only checking the first text directive, so strip add'l ones if present
	72	addl_fragment = anchor_name.find('&text=')
	73	if addl_fragment != -1:
	74	anchor_name = anchor_name[:addl_fragment]
	75	search_terms = anchor_name.split(',')
	76	# Delete prefix and suffix terms because they aren't needed
	77	if search_terms[0].endswith('-'):
	78	search_terms.pop(0)
	79	if search_terms[-1].startswith('-'):
	80	search_terms.pop()
	81	# Remake text directive with the terms separated by spaces as they should be in the page text
	82	newSep = ' '
	83	search_string = newSep.join(search_terms)
	84	if debug: pywikibot.stdout(' Converted text fragment to string "{}".'.format(search_string))
	85	if search_string in page_text:
[1185]	86	found_section = True
[1194]	87	if debug and not print_result: pywikibot.stdout(' Found text fragment!')
	88
	89	# If we're still here, it's a section link; read linked page to see if it really has this
	90	# anchor link
[1185]	91	if found_section == False:
[1194]	92	if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
	93	soup = BeautifulSoup(page_text, 'html.parser')
	94	# Search for a span with this ID
	95	for span_tag in soup.findAll('span'):
	96	span_name = span_tag.get('id', None)
	97	if span_name == anchor_name:
	98	if debug and not print_result: pywikibot.stdout(' Found section in a span!')
	99	found_section = True
	100	break
	101	if found_section == False:
[1185]	102	# Search for a div with this ID
	103	for span_tag in soup.findAll('div'):
	104	span_name = span_tag.get('id', None)
	105	if span_name == anchor_name:
	106	if debug and not print_result: pywikibot.stdout(' Found section in a div!')
	107	found_section = True
	108	break
	109	if found_section == False:
	110	possibly_print(page_name)
[1186]	111	pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
[1185]	112	errors_issued += 1
	113	elif debug and print_result:
	114	pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
	115
	116	# For a link that redirected us to another page, extract the name of the target page from
	117	# the target page's source
	118	def find_canonical_link(page_text, page_name, page_slug):
	119	# Extract link from this markup which contains name of redirected-to page:
	120	# <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
	121	# "wgPageName":"Namespace:Page_name",
	122	canonical_name = page_text.split('"wgPageName":"')[-1]
	123	tag_end = canonical_name.find('",')
	124
	125	if tag_end == -1:
	126	pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
	127	errors_issued = errors_issued + 1
	128	else:
	129	canonical_name = canonical_name[:tag_end]
	130	if len(canonical_name) > 100:
	131	# Certain things can cause the trim to fail; report error and avoid slamming the
	132	# output with massive page source from a failed trim
	133	pywikibot.stdout(' ERROR: The link "{}" is a redirect to "{2}…" (string overflow).'.format(page_slug, canonical_name[:100]))
	134	errors_issued = errors_issued + 1
	135	else:
	136	canonical_name = canonical_name.replace('_', ' ')
	137	if '#' in page_slug:
	138	_, anchor_name = page_slug.split('#')
	139	if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
	140	find_section(page_text, page_name, page_slug, True)
	141	else:
	142	pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))
	143
	144	# Test an intrawiki link and look for a section link if applicable
	145	def test_intrawiki_link(iw_url, page_name, page_slug):
	146	global advice_issued
	147	global errors_issued
	148
	149	response = fetch(iw_url)
	150
	151	# One way we tell that a redirect occurred is by checking fetch's history, as it
	152	# automatically follows redirects. This will catch formal redirects which come from pages
	153	# such as Special:PermanentLink.
	154	if response.history != []:
	155	permalink1 = 'Special:PermanentLink/'.lower()
	156	permalink2 = 'Special:Permalink/'.lower()
	157	page_slug_lower = page_slug.lower()
	158	if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
	159	if debug:
	160	possibly_print(page_name)
	161	pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
	162	find_canonical_link(response.text, page_name, page_slug)
	163	else:
	164	possibly_print(page_name)
	165	pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
	166	advice_issued += 1
	167	elif response.status_code != 200:
	168	possibly_print(page_name)
	169	pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
	170	errors_issued += 1
	171	# However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
	172	# using JavaScript, while returning code OK 200 as if the link was correct; this happens
	173	# when a redirect page is accessed. We must detect these soft redirects by looking at the
	174	# page source to find the redirect note inserted at the top of the page for the reader.
	175	elif 'Redirected from <a' in response.text:
	176	if debug:
	177	possibly_print(page_name)
	178	pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
	179	find_canonical_link(response.text, page_name, page_slug)
	180	else: # URL is OK, so proceed
	181	find_section(response.text, page_name, page_slug, False)
	182
[1169]	183	# Searches the given page text for intrawiki links with section links in them
def scan_for_intrawiki_links(page_text, page_name):
    """Find every section link in a page's wikitext and test each one that is intrawiki.

    page_text: wikitext of the page to scan
    page_name: title of that page; used to resolve relative links ("/Sub", "../", "#Section")
               and for the "From page" header

    Each pattern in link_patterns is run over the text. Links without a '#', and links with
    an interwiki prefix, are skipped. "{{Cn}}" chapter transclusions are expanded from
    chapter_names; other transclusions are reported as advice. Whatever remains is turned
    into a URL under onigalore_url and passed to test_intrawiki_link(). The global counters
    pages_checked, iw_found, advice_issued and errors_issued are updated along the way, and
    name_printed is reset for this page.
    """
    global debug
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    # Compiled once per page rather than once per link; the group captures the chapter number
    chapter_link_pattern = re.compile(r"{{C([0-9]*)}}")

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)
            if i == 1:
                target_start = 14 # "{{SectionLink|"
                target_end = 2 # "}}"
            s = match.start() + target_start # remove the link-opening markup
            e = match.end() - target_end # remove the link-ending markup
            page_slug = page_text[s:e]

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if '#' not in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
            # for the task of checking interwiki page+section links
            if any(prefix + ":" in page_slug for prefix in interwiki_prefixes):
                if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded text.
            # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using
            # our "chapter_names" array. If it's another type of transclusion, punt it to the user.
            if '{' in page_slug:
                ch_link = chapter_link_pattern.search(page_slug)
                if ch_link is None:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

                # Only safe to read the match now that we know there is one
                if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))
                ch_num = ch_link.group(1)
                if ch_num == '':
                    # "{{C}}" with no digits
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                    advice_issued += 1
                    continue

                ch_num_match = int(ch_num)
                if ch_num_match >= len(chapter_names):
                    possibly_print(page_name)
                    pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                    errors_issued += 1
                    continue

                # Replace every occurrence of this exact "{{Cn}}" with the chapter name
                page_slug = page_slug.replace(ch_link.group(0), chapter_names[ch_num_match])
                if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))

            # If this is a relative "/" link, use the current page as the basis for the URL. Note
            # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
            # we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that page,
            # then remove the relative portion of the link. Note that this is only performed once,
            # so if there's multiple steps back ("../../"), we're out of luck.
            if page_slug.startswith('../'):
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't add a
                # slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would just be
            # part of the text) then it's a Main namespace article; proceed with building URL
            if not found_iw_match:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if not found_iw_match:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong
            if (not found_iw_match) or (iw_url == ""):
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                # Count it, so the summary agrees with the number of ERROR lines printed
                errors_issued += 1
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)
[1169]	337
[1185]	338	# Print a wrap-up message
	339	def print_summary():
[1179]	340	global pages_checked
	341	global iw_found
	342	global advice_issued
	343	global errors_issued
[1185]	344
	345	page_str = "pages"
	346	if pages_checked == 1:
	347	page_str = "page"
	348
	349	link_str = "links"
	350	if iw_found == 1:
	351	link_str = "link"
	352
	353	pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
	354	pywikibot.stdout('While attempting to follow section links….')
	355
	356	if advice_issued == 0:
	357	pywikibot.stdout(' No advice on potential problems was issued.')
	358	elif advice_issued == 1:
	359	pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
	360	else:
	361	pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))
	362
	363	error_str = "errors were"
	364	if errors_issued == 1:
	365	error_str = "error was"
	366	pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
	367
	368	# Main function
	369	def main(*args):
	370	global debug
[1179]	371	search_cat = ''
	372	search_page = ''
[1169]	373
[1185]	374	# Process arguments
[1179]	375	local_args = pywikibot.handle_args(args)
	376	for arg in local_args:
	377	if arg.startswith('-cat:'):
	378	search_cat = arg[5:]
	379	elif arg.startswith('-page:'):
	380	search_page = arg[6:]
	381	elif arg == '-dbg':
	382	debug = 1
	383	else:
	384	pywikibot.stdout('Unknown argument "{}".'.format(arg))
	385	return
[1169]	386
[1179]	387	site = pywikibot.Site()
[1169]	388
[1179]	389	# This line of code enumerates the methods in the 'page' class
	390	#pywikibot.stdout(format(dir(page)))
[1169]	391
[1185]	392	# Check specified page or loop through specified category and check all pages
[1179]	393	if search_cat != '':
	394	cat_obj = pywikibot.Category(site, search_cat)
	395	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
	396	for page in pagegenerators.PreloadingGenerator(generator, 100):
	397	if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
	398	scan_for_intrawiki_links(page.text, page.title())
	399	elif search_page != '':
	400	page = pywikibot.Page(site, search_page)
	401	if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
	402	scan_for_intrawiki_links(page.text, page.title())
[1169]	403
[1185]	404	# Print the results
	405	print_summary()
[1169]	406
	407	if __name__ == '__main__':
[1179]	408	main()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: ValBot/Python/check_intrawiki_section_links.py@ 1194

Download in other formats: