# Check Intrawiki Section Links
# by iritscen@yahoo.com
# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
# and loads the linked page and verifies that the named section actually exists. It also
# understands section links generated through a call to Template:SectionLink.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
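# Usage (the script name below is an assumption; adjust it to however this file is
# saved in your Pywikibot "scripts" directory):
#   python pwb.py check_intrawiki_section_links -page:PAGE_NAME
#   python pwb.py check_intrawiki_section_links -cat:CATEGORY_NAME
# Add -dbg to either invocation for verbose output.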

import os
import re
from urllib.parse import urljoin

import pywikibot
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Tuple of OniGalore's namespaces
intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion
chapter_names = ['CHAPTER_00_._COMBAT_TRAINING', 'CHAPTER_01_._TRIAL_RUN', 'CHAPTER_02_._ENGINES_OF_EVIL', 'CHAPTER_03_._PUZZLE_PIECES', 'CHAPTER_04_._TIGER_BY_THE_TAIL', 'CHAPTER_05_._HOT_PURSUIT', 'CHAPTER_06_._COUNTERATTACK', 'CHAPTER_07_._A_FRIEND_IN_NEED', 'CHAPTER_08_._AN_INNOCENT_LIFE', 'CHAPTER_09_._TRUTH_AND_CONSEQUENCES', 'CHAPTER_10_._CAT_AND_MOUSE', 'CHAPTER_11_._DREAM_DIVER', 'CHAPTER_12_._SINS_OF_THE_FATHER', 'CHAPTER_13_._PHOENIX_RISING', 'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS']

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
link_patterns = (r"\[\[[^|\]]*(\||\])", r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}")
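# For illustration (page and section names invented): pattern 1 matches the text
# "[[OBD:BINA#Header|" within "[[OBD:BINA#Header|the header]]", stopping at the first
# "|" or "]"; pattern 2 matches all of "{{SectionLink|OBD:BINA|Header}}".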

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
advice_issued = 0
errors_issued = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(page_name))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(page_text, page_name, page_slug, print_result):
    global errors_issued
    found_section = False

    # Isolate section link or text fragment link
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')

    # First check if this is a text fragment directive, and look for it if so
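    # For example (hypothetical link), "Page#:~:text=foo-,bar,baz,-qux" is reduced by
    # the steps below to the search string "bar baz", once the prefix ("foo-") and
    # suffix ("-qux") terms are dropped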
    if anchor_name.startswith(':~:text='):
        if debug: pywikibot.stdout(' Found text fragment directive {} from URL {}.'.format(anchor_name, page_slug))
        anchor_name = anchor_name[8:]
        # We're only checking the first text directive, so strip add'l ones if present
        addl_fragment = anchor_name.find('&text=')
        if addl_fragment != -1:
            anchor_name = anchor_name[:addl_fragment]
        search_terms = anchor_name.split(',')
        # Delete prefix and suffix terms because they aren't needed
        if search_terms[0].endswith('-'):
            search_terms.pop(0)
        if search_terms[-1].startswith('-'):
            search_terms.pop()
        # Remake the text directive with the terms separated by spaces, as they should
        # appear in the page text
        search_string = ' '.join(search_terms)
        if debug: pywikibot.stdout(' Converted text fragment to string "{}".'.format(search_string))
        if search_string in page_text:
            found_section = True
            if debug and not print_result: pywikibot.stdout(' Found text fragment!')

    # If we're still here, it's a section link; read the linked page to see if it really
    # has this anchor
    if not found_section:
        if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
        soup = BeautifulSoup(page_text, 'html.parser')
        # Search for a span with this ID
        for span_tag in soup.find_all('span'):
            span_name = span_tag.get('id', None)
            if span_name == anchor_name:
                if debug and not print_result: pywikibot.stdout(' Found section in a span!')
                found_section = True
                break
        if not found_section:
            # Search for a div with this ID
            for div_tag in soup.find_all('div'):
                div_name = div_tag.get('id', None)
                if div_name == anchor_name:
                    if debug and not print_result: pywikibot.stdout(' Found section in a div!')
                    found_section = True
                    break
    if not found_section:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on page "{1}"!'.format(anchor_name, target_page_name_human))
        errors_issued += 1
    elif debug and print_result:
        pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))

# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(page_text, page_name, page_slug):
    global errors_issued

    # The target page's source gives the redirected-to page's name in both of these
    # notations; we parse the "wgPageName" one:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    # "wgPageName":"Namespace:Page_name",
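    # e.g. if the fetched source contains '"wgPageName":"OBD:BINA",' (an invented
    # example), the split-and-trim below extracts the name "OBD:BINA"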
    canonical_name = page_text.split('"wgPageName":"')[-1]
    tag_end = canonical_name.find('",')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
        errors_issued += 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report an error and avoid
            # slamming the output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
            errors_issued += 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in page_slug:
                _, anchor_name = page_slug.split('#', 1)
                if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
                find_section(page_text, page_name, page_slug, True)
            else:
                pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))

# Test an intrawiki link and look for a section link if applicable
def test_intrawiki_link(iw_url, page_name, page_slug):
    global advice_issued
    global errors_issued

    response = fetch(iw_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from
    # pages such as Special:PermanentLink.
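    # e.g. a link like "Special:Permalink/12345#History" (revision number invented for
    # illustration) is fetched via a redirect, which shows up in response.history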
    permalink1 = 'Special:PermanentLink/'.lower()
    permalink2 = 'Special:Permalink/'.lower()
    page_slug_lower = page_slug.lower()
    if response.history != [] and (page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2)):
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
        find_canonical_link(response.text, page_name, page_slug)
    # However, the usual way that a redirect occurs is that a redirect page is visited
    # and MediaWiki sends us to the new page using JavaScript while returning code 301.
    # Formerly it returned 200 as if the link was correct, so rather than looking for
    # code 301 we detect these soft redirects by looking at the page source to find the
    # redirect note that gets inserted at the top of the page for the reader.
    elif 'Redirected from <a' in response.text:
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
        find_canonical_link(response.text, page_name, page_slug)
    # This handles response codes other than 200 and 301 (301 is returned in the above
    # case of a silent redirect)
    elif response.status_code != 200:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
        errors_issued += 1
    else: # URL is OK, so proceed
        find_section(response.text, page_name, page_slug, False)

# Searches the given page text for intrawiki links with section links in them
def scan_for_intrawiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)
            if i == 1:
                target_start = 14 # "{{SectionLink|"
                target_end = 2 # "}}"
            s = match.start() + target_start # remove the link-opening markup
            e = match.end() - target_end # remove the link-ending markup
            page_slug = page_text[s:e]

            # The second link type will look like "Page|Section" or "|Section", so fix
            # that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about
            # it, as MediaWiki takes care of checking basic intrawiki links
            if '#' not in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see
            # check_interwiki_links.py for the task of checking interwiki page+section
            # links
            is_interwiki = False
            if not found_iw_match:
                for prefix in interwiki_prefixes:
                    if prefix + ":" in page_slug:
                        if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                        is_interwiki = True
                        break
            if is_interwiki:
                continue

            # If there is a '{' in the link, it's probably a link built on transcluded
            # text. If it's a chapter template transclusion like "Quotes/Diary#{{C3}}",
            # expand it using our "chapter_names" array. If it's another type of
            # transclusion, punt it to the user.
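            # e.g. "Quotes/Diary#{{C3}}" (an illustrative link) expands via
            # chapter_names[3] to "Quotes/Diary#CHAPTER_03_._PUZZLE_PIECES"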
            if '{' in page_slug:
                ch_link_pattern = re.compile(r"{{C[0-9]*}}")
                ch_link = ch_link_pattern.search(page_slug)
                if ch_link:
                    if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))
                    ch_link_match = ch_link.group(0)
                    ch_num_pattern = re.compile("[0-9]+")
                    ch_num = ch_num_pattern.search(ch_link_match)
                    if ch_num:
                        ch_num_match = int(ch_num.group(0))
                        if 0 <= ch_num_match <= 14:
                            ch_name = chapter_names[ch_num_match]
                            replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}")
                            page_slug = replace_pattern.sub(ch_name, page_slug)
                            if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))
                        else:
                            possibly_print(page_name)
                            pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                            errors_issued += 1
                            continue
                    else:
                        possibly_print(page_name)
                        pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                        advice_issued += 1
                        continue
                else:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one, so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

            # If this is a relative "/" link, use the current page as the basis for the
            # URL. Note that only a leading slash is looked for, so if there are multiple
            # steps down ("/x/y"), we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to
            # that page, then remove the relative portion of the link. Note that this is
            # only performed once, so if there are multiple steps back ("../../"), we're
            # out of luck.
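            # e.g. on a hypothetical page "AE/Tools/Setup", the link "../Intro#Goals"
            # becomes page_name2 "AE/Tools" and page_slug "AE/Tools/Intro#Goals"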
            if page_slug.startswith('../'):
                last_slash = page_name.rfind('/')
                page_name2 = page_name[:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page,
                # don't add a slash, otherwise do because we are drilling down to
                # another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build the URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon
            # would just be part of the text), then it's a Main namespace article;
            # proceed with building the URL
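            # e.g. "Konoko#Appearance" (no colon) is treated as a Main namespace page,
            # while a link like "XML:BINA#Intro" falls through to the namespace check
            # below (both examples invented)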
            if not found_iw_match:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on
            # OniGalore before building the URL
            if not found_iw_match:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong
            if (not found_iw_match) or (iw_url == ""):
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                errors_issued += 1
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # This line of code enumerates the methods in the 'page' class
    #pywikibot.stdout(format(dir(page)))

    # Check the specified page, or loop through the specified category and check all its
    # pages
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
            scan_for_intrawiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()