# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered
# interwiki prefix and loads the linked page, verifying that it exists and that any section
# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
# validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from pywikibot.tools.formatter import color_format
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
        self.iw_prefix = iw_prefix         # e.g. "wp"
        self.prefix_url = prefix_url       # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url           # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
        self.page_name = page_name         # "Easter egg"
        self.page_slug = page_slug         # "Easter_egg"
        self.curl_response = curl_response # a class defined in the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia',
                      'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia',
                      'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage',
                      'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=',
                  'http://www.google.com/search?q=cache:',
                  'https://commons.wikimedia.org/wiki/',
                  'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=',
                  'http://www.google.com/search?q=',
                  'https://meta.wikimedia.org/wiki/',
                  'https://www.mediawiki.org/wiki/',
                  'https://en.wikibooks.org/wiki/',
                  'https://www.wikidata.org/wiki/',
                  'https://foundation.wikimedia.org/wiki/',
                  'https://en.wikinews.org/wiki/',
                  'https://en.wikipedia.org/wiki/',
                  'https://en.wikiquote.org/wiki/',
                  'https://wikisource.org/wiki/',
                  'https://species.wikimedia.org/wiki/',
                  'https://en.wikiversity.org/wiki/',
                  'https://en.wikivoyage.org/wiki/',
                  'https://en.wiktionary.org/wiki/',
                  'https://en.wiktionary.org/wiki/',
                  'https://en.wikipedia.org/wiki/')
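
# Note: the two tuples above are parallel, so a prefix's base URL sits at the same index as
# the prefix itself; e.g. interwiki_urls[interwiki_prefixes.index('wp')] evaluates to
# 'https://en.wikipedia.org/wiki/'. The sanity check below is not part of the original
# checking logic, only a guard on the assumption that the tuples stay the same length and in
# the same order, which is what makes that lookup valid.
assert len(interwiki_prefixes) == len(interwiki_urls), 'Parallel interwiki arrays are out of sync'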

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(page_name))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link
    target_page_name, anchor_name = the_link.page_slug.split('#')
    target_page_name_human = target_page_name.replace('_', ' ')

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if found_section == False:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
        # TODO: Check that page name has been corrected to redirected page if there was a redirect
        errors_issued = errors_issued + 1
    elif print_result == True:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
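
# Note on the anchor decoding in find_section() above: MediaWiki links sometimes carry
# section anchors with dot-encoded punctuation, so a hypothetical anchor written as
# "Easter_egg_.28media.29" in a link should match an element whose id is
# "Easter_egg_(media)" in the target page's HTML. The replacements list only covers the
# characters encountered so far; any other dot-encoded character would need its own entry.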

# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract link from this markup, which contains the name of the redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    # Strip the prefix URL so that only the page slug remains, then look for the closing quote
    canonical_name = canonical_name[len(the_link.prefix_url):]
    tag_end = canonical_name.find('"')
    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the
            # output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                _, anchor_name = the_link.page_slug.split('#')
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
                # Update page slug so that find_section() uses the right page name in its messages
                the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name)
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    if the_link.curl_response.history != []:
        possibly_print(the_link.page_name)

        # If the linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued = errors_issued + 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    # However, the usual way that a redirect occurs is that MediaWiki silently serves the
    # content of the target page while returning code OK 200 as if the link was correct;
    # this happens when a redirect page is accessed. We must detect these soft redirects by
    # looking at the page source to find the redirect note inserted at the top of the page
    # for the reader.
    elif 'Redirected from