Context Navigation

source: ValBot/Python/check_intrawiki_section_links.py@ 1193

Last change on this file since 1193 was 1192, checked in by iritscen, 2 months ago
ValBot: Corrected syntax of two strings with regex patterns.
File size: 18.0 KB

Rev	Line
[1171]	1	# Check Intrawiki Section Links
	2	# by iritscen@yahoo.com
	3	# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
[1179]	4	# and loads the linked page and verifies that the named section actually exists. It also
	5	# understands section links generated through a call to Template:SectionLink.
[1171]	6	# Recommended viewing width:
	7	# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
	8
[1169]	9	import os
	10
	11	from urllib.parse import urljoin
	12
	13	import pywikibot
	14	import re
	15
	16	from pywikibot.bot import QuitKeyboardInterrupt
	17	from pywikibot import pagegenerators
	18	from pywikibot.tools.formatter import color_format
	19	from pywikibot.comms.http import fetch
	20	from pywikibot.specialbots import UploadRobot
	21	from bs4 import BeautifulSoup
	22
# Tuple of OniGalore's namespaces
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk',
    'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk',
    'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk',
    'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk',
)

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw',
    'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote',
    'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp',
)

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion; the
# list index is the chapter number "n"
chapter_names = [
    'CHAPTER_00_._COMBAT_TRAINING',
    'CHAPTER_01_._TRIAL_RUN',
    'CHAPTER_02_._ENGINES_OF_EVIL',
    'CHAPTER_03_._PUZZLE_PIECES',
    'CHAPTER_04_._TIGER_BY_THE_TAIL',
    'CHAPTER_05_._HOT_PURSUIT',
    'CHAPTER_06_._COUNTERATTACK',
    'CHAPTER_07_._A_FRIEND_IN_NEED',
    'CHAPTER_08_._AN_INNOCENT_LIFE',
    'CHAPTER_09_._TRUTH_AND_CONSEQUENCES',
    'CHAPTER_10_._CAT_AND_MOUSE',
    'CHAPTER_11_._DREAM_DIVER',
    'CHAPTER_12_._SINS_OF_THE_FATHER',
    'CHAPTER_13_._PHOENIX_RISING',
    'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS',
]

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
#            (the match stops at the first "|" or "]" after the opening brackets)
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
link_patterns = (
    r"\[\[[^|\]]*(\||\])",
    r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}",
)

# Initialize globals
debug = 0           # set to 1 by the "-dbg" argument to enable verbose output
pages_checked = 0   # number of pages scanned so far
iw_found = 0        # number of intrawiki section links found
advice_issued = 0   # number of ADVICE messages printed
errors_issued = 0   # number of ERROR messages printed
name_printed = 0    # whether the "From page" header was already printed for this page
[1169]	47
# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    """Print a 'From page "<page_name>":' header once, unless debug output is enabled.

    Does nothing when the global ``name_printed`` flag is already set or when the
    global ``debug`` flag is on; otherwise prints a blank line followed by the header
    and sets ``name_printed`` to 1.
    """
    global name_printed

    # Header already shown, or we are in debug mode: nothing to do
    if name_printed or debug:
        return

    pywikibot.stdout('')
    pywikibot.stdout('From page "{}":'.format(page_name))
    name_printed = 1
	57
	58	# Search a page for the section specified in the link
	59	def find_section(page_text, page_name, page_slug, print_result):
	60	global errors_issued
	61
	62	# Isolate section link
	63	target_page_name, anchor_name = page_slug.split('#', 1)
	64	target_page_name_human = target_page_name.replace('_', ' ')
	65	if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
	66
	67	# Read linked page to see if it really has this anchor link
	68	soup = BeautifulSoup(page_text, 'html.parser')
	69	found_section = False
	70	for span_tag in soup.findAll('span'):
	71	span_name = span_tag.get('id', None)
	72	if span_name == anchor_name:
	73	if debug and not print_result: pywikibot.stdout(' Found section in a span!')
	74	found_section = True
	75	break
	76	if found_section == False:
	77	# Search for a div with this ID
	78	for span_tag in soup.findAll('div'):
	79	span_name = span_tag.get('id', None)
	80	if span_name == anchor_name:
	81	if debug and not print_result: pywikibot.stdout(' Found section in a div!')
	82	found_section = True
	83	break
	84	if found_section == False:
	85	possibly_print(page_name)
[1186]	86	pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
[1185]	87	errors_issued += 1
	88	elif debug and print_result:
	89	pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
	90
	91	# For a link that redirected us to another page, extract the name of the target page from
	92	# the target page's source
	93	def find_canonical_link(page_text, page_name, page_slug):
	94	# Extract link from this markup which contains name of redirected-to page:
	95	# <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
	96	# "wgPageName":"Namespace:Page_name",
	97	canonical_name = page_text.split('"wgPageName":"')[-1]
	98	tag_end = canonical_name.find('",')
	99
	100	if tag_end == -1:
	101	pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
	102	errors_issued = errors_issued + 1
	103	else:
	104	canonical_name = canonical_name[:tag_end]
	105	if len(canonical_name) > 100:
	106	# Certain things can cause the trim to fail; report error and avoid slamming the
	107	# output with massive page source from a failed trim
	108	pywikibot.stdout(' ERROR: The link "{}" is a redirect to "{2}…" (string overflow).'.format(page_slug, canonical_name[:100]))
	109	errors_issued = errors_issued + 1
	110	else:
	111	canonical_name = canonical_name.replace('_', ' ')
	112	if '#' in page_slug:
	113	_, anchor_name = page_slug.split('#')
	114	if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
	115	find_section(page_text, page_name, page_slug, True)
	116	else:
	117	pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))
	118
	119	# Test an intrawiki link and look for a section link if applicable
	120	def test_intrawiki_link(iw_url, page_name, page_slug):
	121	global advice_issued
	122	global errors_issued
	123
	124	response = fetch(iw_url)
	125
	126	# One way we tell that a redirect occurred is by checking fetch's history, as it
	127	# automatically follows redirects. This will catch formal redirects which come from pages
	128	# such as Special:PermanentLink.
	129	if response.history != []:
	130
	131	permalink1 = 'Special:PermanentLink/'.lower()
	132	permalink2 = 'Special:Permalink/'.lower()
	133	page_slug_lower = page_slug.lower()
	134	if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
	135	if debug:
	136	possibly_print(page_name)
	137	pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
	138	find_canonical_link(response.text, page_name, page_slug)
	139	else:
	140	possibly_print(page_name)
	141	pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
	142	advice_issued += 1
	143	elif response.status_code != 200:
	144	possibly_print(page_name)
	145	pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
	146	errors_issued += 1
	147	# However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
	148	# using JavaScript, while returning code OK 200 as if the link was correct; this happens
	149	# when a redirect page is accessed. We must detect these soft redirects by looking at the
	150	# page source to find the redirect note inserted at the top of the page for the reader.
	151	elif 'Redirected from <a' in response.text:
	152	if debug:
	153	possibly_print(page_name)
	154	pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
	155	find_canonical_link(response.text, page_name, page_slug)
	156	else: # URL is OK, so proceed
	157	find_section(response.text, page_name, page_slug, False)
	158
# Searches the given page text for intrawiki links with section links in them
def scan_for_intrawiki_links(page_text, page_name):
    """Find every section link in page_text and verify it against the live wiki.

    page_text -- wikitext of the page being checked
    page_name -- title of that page; used for resolving relative links and for output

    Each pattern in link_patterns is applied in turn (index 0: "[[...]]" wikilinks,
    index 1: "{{SectionLink|...}}" calls). Links without a '#', and links carrying an
    interwiki prefix, are skipped. "{{Cn}}" chapter transclusions are expanded from
    chapter_names; other transclusions are reported as ADVICE. Relative links ("/x",
    "../x", "#x") are resolved against page_name. Each remaining link is turned into a
    URL and passed to test_intrawiki_link().

    Updates the global counters pages_checked, iw_found, advice_issued and errors_issued,
    and resets name_printed so the page header can be printed again for this page.
    """
    global debug
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    # Loop-invariant patterns for recognizing "{{Cn}}" chapter transclusions
    ch_link_pattern = re.compile(r"{{C[0-9]*}}")
    ch_num_pattern = re.compile(r"[0-9]+")

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)
            if i == 1:
                target_start = 14 # "{{SectionLink|"
                target_end = 2 # "}}"
            s = match.start() + target_start # remove the link-opening markup
            e = match.end() - target_end # remove the link-ending markup
            page_slug = page_text[s:e]

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug:
                pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if '#' not in page_slug:
                if debug:
                    pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see
            # check_interwiki_links.py for the task of checking interwiki page+section links
            is_interwiki = False
            for prefix in interwiki_prefixes:
                if prefix + ":" in page_slug:
                    if debug:
                        pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                    is_interwiki = True
                    break
            if is_interwiki:
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded
            # text. If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand
            # it using our "chapter_names" array. If it's another type of transclusion, punt it
            # to the user.
            if '{' in page_slug:
                ch_link = ch_link_pattern.search(page_slug)
                if ch_link:
                    # Only touch the match object once we know the search succeeded
                    ch_link_match = ch_link.group(0)
                    if debug:
                        pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link_match))
                    ch_num = ch_num_pattern.search(ch_link_match)
                    if ch_num:
                        ch_num_match = int(ch_num.group(0))
                        if 0 <= ch_num_match < len(chapter_names):
                            ch_name = chapter_names[ch_num_match]
                            page_slug = page_slug.replace(ch_link_match, ch_name)
                            if debug:
                                pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))
                        else:
                            possibly_print(page_name)
                            pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                            errors_issued += 1
                            continue
                    else:
                        possibly_print(page_name)
                        pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                        advice_issued += 1
                        continue
                else:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

            # If this is a relative "/" link, use the current page as the basis for the URL.
            # Note that only a leading slash is looked for, so if there's multiple steps down
            # ("/x/y"), we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug:
                    pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that
            # page, then remove the relative portion of the link. Note that this is only
            # performed once, so if there's multiple steps back ("../../"), we're out of luck.
            if page_slug.startswith('../'):
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug:
                    pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:len(page_slug)]
                if debug:
                    pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't
                # add a slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug:
                    pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would just
            # be part of the text) then it's a Main namespace article; proceed with building URL
            if found_iw_match == False:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug:
                        pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if found_iw_match == False:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug:
                            pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong
            if (found_iw_match == False) or (iw_url == ""):
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                # Count it, like every other ERROR, so the summary total is accurate
                errors_issued += 1
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug:
                pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)
[1169]	313
# Print a wrap-up message
def print_summary():
    """Print the totals of pages checked, links found, advice issued and errors issued.

    Reads the global counters pages_checked, iw_found, advice_issued and errors_issued;
    nouns and verbs are singular when the corresponding count is exactly 1.
    """
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    if advice_issued == 0:
        advice_line = ' No advice on potential problems was issued.'
    elif advice_issued == 1:
        advice_line = ' 1 piece of advice on a potential problem was issued.'
    else:
        advice_line = ' {} pieces of advice on potential problems were issued.'.format(advice_issued)
    pywikibot.stdout(advice_line)

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
	343
	344	# Main function
	345	def main(*args):
	346	global debug
[1179]	347	search_cat = ''
	348	search_page = ''
[1169]	349
[1185]	350	# Process arguments
[1179]	351	local_args = pywikibot.handle_args(args)
	352	for arg in local_args:
	353	if arg.startswith('-cat:'):
	354	search_cat = arg[5:]
	355	elif arg.startswith('-page:'):
	356	search_page = arg[6:]
	357	elif arg == '-dbg':
	358	debug = 1
	359	else:
	360	pywikibot.stdout('Unknown argument "{}".'.format(arg))
	361	return
[1169]	362
[1179]	363	site = pywikibot.Site()
[1169]	364
[1179]	365	# This line of code enumerates the methods in the 'page' class
	366	#pywikibot.stdout(format(dir(page)))
[1169]	367
[1185]	368	# Check specified page or loop through specified category and check all pages
[1179]	369	if search_cat != '':
	370	cat_obj = pywikibot.Category(site, search_cat)
	371	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
	372	for page in pagegenerators.PreloadingGenerator(generator, 100):
	373	if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
	374	scan_for_intrawiki_links(page.text, page.title())
	375	elif search_page != '':
	376	page = pywikibot.Page(site, search_page)
	377	if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
	378	scan_for_intrawiki_links(page.text, page.title())
[1169]	379
[1185]	380	# Print the results
	381	print_summary()
[1169]	382
	383	if __name__ == '__main__':
[1179]	384	main()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: