Changeset 1196
- Timestamp: Aug 15, 2025, 10:55:01 PM
- File: 1 edited
Legend:
- Unmodified: lines with no prefix
- Added: lines prefixed with "+"
- Removed: lines prefixed with "-"
ValBot/Python/check_interwiki_links.py
--- r1192
+++ r1196

 # Check Interwiki Links
 # by iritscen@yahoo.com
-# Looks at each link on a page (or inall the pages in a category) which uses a registered
+# Looks at each link on a page (or all the pages in a category) which uses a registered
 # interwiki prefix and loads the linked page, verifying that it exists and that any section
 # link, if present, is valid as well. The output will use the word "ERROR" when it cannot
…
 # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
 
-import os
-
-from urllib.parse import urljoin
-
+import bs4
 import pywikibot
-import bs4
 import re
-import requests # for listing members with dir()
-
+import requests # for listing members with dir() when debugging
+
+from bs4 import BeautifulSoup
+from pywikibot import pagegenerators
 from pywikibot.bot import QuitKeyboardInterrupt
-from pywikibot import pagegenerators
-from pywikibot.tools.formatter import color_format
 from pywikibot.comms.http import fetch
 from pywikibot.specialbots import UploadRobot
-from bs4 import BeautifulSoup
+from pywikibot.tools.formatter import color_format
+from urllib.parse import urljoin
+
+class IWLink:
+    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
+        self.iw_prefix = iw_prefix         # e.g. "wp"
+        self.prefix_url = prefix_url       # e.g. "https://en.wikipedia.org/wiki/"
+        self.full_url = full_url           # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
+        self.page_name = page_name         # "Easter egg"
+        self.page_slug = page_slug         # "Easter_egg"
+        self.curl_response = curl_response # a class defined in the Requests library
 
 # Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
…
 iw_found = 0
 errors_issued = 0
+unintended_redirects_found = 0
 name_printed = 0
 
…
 
 # Search a page for the section specified in the link
-def find_section(page_text, page_name, page_slug, prefix, print_result):
+def find_section(the_link, print_result):
     global errors_issued
 
     # Isolate section link
-    target_page_name, anchor_name = page_slug.split('#')
+    target_page_name, anchor_name = the_link.page_slug.split('#')
     target_page_name_human = target_page_name.replace('_', ' ')
 
     # Convert dot-notation hex entities to proper characters
-    anchor_name = anchor_name.replace('.22', '"')
-    anchor_name = anchor_name.replace('.27', '\'')
-    anchor_name = anchor_name.replace('.28', '(')
-    anchor_name = anchor_name.replace('.29', ')')
+    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
+    for pattern, replacement in replacements:
+        anchor_name = re.sub(pattern, replacement, anchor_name)
 
     # Read linked page to see if it really has this anchor link
-    soup = BeautifulSoup(page_text, 'html.parser')
+    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
+    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
     found_section = False
-    for the_tag in soup.findAll('span'): # search for span with ID matching the section name
-        tag_name = the_tag.get('id', None)
-        if tag_name == anchor_name:
-            found_section = True
-            break
+    for tag_name in tags_to_search:
+        for the_tag in soup.find_all(tag_name):
+            if the_tag.get('id') == anchor_name:
+                found_section = True
+                break
+        if found_section:
+            break
+
+    # Tell user what we found
     if found_section == False:
-        for the_tag in soup.findAll('div'): # search for div with ID matching the section name
-            tag_name = the_tag.get('id', None)
-            if tag_name == anchor_name:
-                found_section = True
-                break
-    if found_section == False:
-        for the_tag in soup.findAll('h2'): # search for h2 with ID matching the section name
-            tag_name = the_tag.get('id', None)
-            if tag_name == anchor_name:
-                found_section = True
-                break
-    if found_section == False:
-        for the_tag in soup.findAll('h3'): # search for h3 with ID matching the section name
-            tag_name = the_tag.get('id', None)
-            if tag_name == anchor_name:
-                found_section = True
-                break
-    if found_section == False:
-        for the_tag in soup.findAll('h4'): # search for h4 with ID matching the section name
-            tag_name = the_tag.get('id', None)
-            if tag_name == anchor_name:
-                found_section = True
-                break
-    if found_section == False:
-        possibly_print(page_name)
-        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, prefix, target_page_name_human))
+        possibly_print(the_link.page_name)
+        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
+        # TODO: Check that page name has been corrected to redirected page if there was a redirect
         errors_issued = errors_issued + 1
     elif print_result == True:
-        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, prefix, target_page_name_human))
+        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
 
 # For a link that redirected us to another page, extract the name of the target page from
 # the target page's source
-def find_canonical_link(page_text, page_name, page_slug, prefix, prefix_url):
+def find_canonical_link(the_link):
     # Extract link from this markup which contains name of redirected-to page:
     # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
-    canonical_name = page_text.split('<link rel="canonical" href="')[-1]
-    prefix_length = len(prefix_url)
+    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
+    prefix_length = len(the_link.prefix_url)
     canonical_name = canonical_name[prefix_length:]
     tag_end = canonical_name.find('">')
 
     if tag_end == -1:
-        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(prefix, page_slug))
+        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
         errors_issued = errors_issued + 1
     else:
…
             # Certain things can cause the trim to fail; report error and avoid slamming the
             # output with massive page source from a failed trim
-            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(prefix, page_slug, canonical_name[:100]))
+            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
             errors_issued = errors_issued + 1
         else:
             canonical_name = canonical_name.replace('_', ' ')
-            if '#' in page_slug:
-                _, anchor_name = page_slug.split('#')
-                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking section link….'.format(prefix, page_slug, canonical_name, anchor_name))
-                find_section(page_text, page_name, page_slug, prefix, True)
+            if '#' in the_link.page_slug:
+                _, anchor_name = the_link.page_slug.split('#')
+                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
+                the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name) # update page slug so that find_section() uses the right page name in its messages
+                find_section(the_link, True)
             else:
-                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(prefix, page_slug, canonical_name))
+                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))
 
 # Test an interwiki link and look for a section link if applicable
-def test_interwiki_link(prefix, prefix_url, iw_url, page_name, page_slug):
+def test_interwiki_link(the_link):
     global errors_issued
-
-    response = fetch(iw_url)
+    global unintended_redirects_found
+
+    the_link.curl_response = fetch(the_link.full_url)
 
     # One way we tell that a redirect occurred is by checking fetch's history, as it
     # automatically follows redirects. This will catch formal redirects which come from pages
     # such as Special:PermanentLink.
-    if response.history != []:
-        possibly_print(page_name)
-
-        if page_slug.startswith('WP:') and page_slug == page_slug.upper():
-            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(response.history[0], prefix, page_slug))
-            find_canonical_link(response.text, page_name, page_slug, prefix, prefix_url)
+    if the_link.curl_response.history != []:
+        possibly_print(the_link.page_name)
+
+        # If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
+        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
+            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
+            find_canonical_link(the_link)
         else:
             permalink1 = 'Special:PermanentLink/'.lower()
             permalink2 = 'Special:Permalink/'.lower()
-            page_slug_lower = page_slug.lower()
+            page_slug_lower = the_link.page_slug.lower()
             if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
-                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(response.history[0], prefix, page_slug))
-                find_canonical_link(response.text, page_name, page_slug, prefix, prefix_url)
+                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
+                find_canonical_link(the_link)
             else:
-                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(response.history[0], prefix, page_slug))
+                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                 errors_issued = errors_issued + 1
-    elif response.status_code != 200:
-        possibly_print(page_name)
-        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(response.status_code, prefix, page_slug))
+    elif the_link.curl_response.status_code != 200:
+        possibly_print(the_link.page_name)
+        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
         errors_issued = errors_issued + 1
     # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
…
     # when a redirect page is accessed. We must detect these soft redirects by looking at the
     # page source to find the redirect note inserted at the top of the page for the reader.
-    elif 'Redirected from <a' in response.text:
-        possibly_print(page_name)
-        pywikibot.stdout(' Got silently redirected by {0} link "{1}". Checking the target page….'.format(prefix, page_slug))
-        find_canonical_link(response.text, page_name, page_slug, prefix, prefix_url)
-    elif '#' in page_slug:
-        find_section(response.text, page_name, page_slug, prefix, False)
+    elif 'Redirected from <a' in the_link.curl_response.text:
+        unintended_redirects_found = unintended_redirects_found + 1
+        possibly_print(the_link.page_name)
+        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
+        find_canonical_link(the_link)
+    elif '#' in the_link.page_slug:
+        find_section(the_link, False)
 
 # Searches the given page text for interwiki links
…
         iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
         for match in re.finditer(iw_link, page_text):
+            the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", page_name, "", "")
+
             # Extract just the page title from this regex match
-            s = match.start() + 2 + len(prefix) + 1
+            s = match.start() + 2 + len(the_link.iw_prefix) + 1
             e = match.end() - 1
 
             # Commonly we use spaces instead of underscores, so fix that before querying
-            page_slug = page_text[s:e].replace(' ', '_')
+            the_link.page_slug = page_text[s:e].replace(' ', '_')
 
             # But use spaces for title when printing it
-            page_title_human = page_slug.replace('_', ' ')
-            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title_human))
+            page_title_human = the_link.page_slug.replace('_', ' ')
+            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, page_title_human))
             iw_found = iw_found + 1
 
             # Construct full URL for the particular wiki
-            iw_url = interwiki_urls[cur_prefix] + page_slug
+            the_link.full_url = the_link.prefix_url + the_link.page_slug
 
             # Adjust URL if this is a foreign-language WP link
-            if re.match("^[a-zA-Z]{2}:", page_slug):
-                lang_code = page_slug[0:2] + "."
+            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
+                lang_code = the_link.page_slug[0:2] + "."
                 # "wp:" is the Wikipedia: namespace, not a language
                 if lang_code != "wp." and lang_code != "WP.":
-                    iw_url = iw_url.replace('en.', lang_code)
-                    iw_url = iw_url.replace(page_slug[0:3], '')
+                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
+                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')
 
             # Test the URL
-            test_interwiki_link(prefix, interwiki_urls[cur_prefix], iw_url, page_name, page_slug)
+            test_interwiki_link(the_link)
         cur_prefix = cur_prefix + 1
 
…
     global iw_found
     global errors_issued
+    global unintended_redirects_found
 
     page_str = "pages"
…
 
     pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))
+
+    warning_str = "likely-unintended redirects were"
+    if unintended_redirects_found == 1:
+        warning_str = "likely-unintended redirect was"
+
+    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))
 
 # Main function
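A note on the consolidated section search: the five copy-pasted findAll() loops are now a single nested loop over a tag list. The technique can be tried in isolation with this minimal sketch, which assumes only that BeautifulSoup is installed; find_anchor() and the sample HTML are illustrative and not part of the script:

from bs4 import BeautifulSoup

# Illustrative helper: report whether any likely section-bearing tag has the given ID
def find_anchor(page_html, anchor_name):
    soup = BeautifulSoup(page_html, 'html.parser')
    # MediaWiki can attach a section ID to any of these tags, so try each in turn
    for tag_name in ['span', 'div', 'h2', 'h3', 'h4']:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                return True
    return False

sample = '<h2 id="History">History</h2><span id="Notes">Notes</span>'
print(find_anchor(sample, 'History'))  # True
print(find_anchor(sample, 'Trivia'))   # False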
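The dot-notation cleanup that replaces the chain of str.replace() calls behaves like this; the sample anchor string is made up for illustration:

import re

anchor = 'Easter_eggs_.28hidden_jokes.29'
# Same four substitutions the changeset introduces; the dots must be escaped
# because the patterns are regular expressions under re.sub()
for pattern, replacement in [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]:
    anchor = re.sub(pattern, replacement, anchor)
print(anchor)  # Easter_eggs_(hidden_jokes)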
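The two redirect checks in test_interwiki_link() can be reproduced with plain Requests, as sketched below; the script itself goes through pywikibot's fetch(), but the response object it stores in curl_response exposes the same history and text attributes used here. The URL is only an example of a shortcut that redirects:

import requests

response = requests.get('https://en.wikipedia.org/wiki/WP:BEANS')
if response.history != []:
    # Formal redirect: the server answered with a 3xx hop before the final page
    print('Redirected, first hop:', response.history[0].status_code)
elif 'Redirected from <a' in response.text:
    # Soft redirect: MediaWiki served the target page with a note at the top
    print('Silently redirected by MediaWiki')
else:
    print('No redirect detected')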
Note: See TracChangeset for help on using the changeset viewer.