- Timestamp: Sep 21, 2025, 11:50:56 PM
- File: 1 edited
Legend:
- Unmodified (no prefix)
- Added (prefixed with "+")
- Removed (prefixed with "-")
ValBot/Python/check_interwiki_links.py
r1197 → r1198

  # Check Interwiki Links
  # by iritscen@yahoo.com
- # Looks at each link on a page (or all the pages in a category) which uses a registered
- # interwiki prefix and loads the linked page, verifying that it exists and that any section
- # link, if present, is valid as well. The output will use the word "ERROR" when it cannot
- # validate the interwiki link.
+ # Looks at each link on a page (or all the pages in a category) which uses a registered interwiki prefix and loads the linked page, verifying that it exists and that
+ # any section link, if present, is valid as well. The output will use the word "ERROR" when it cannot validate the interwiki link.
  # Recommended viewing width:
- # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --- |
+ # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----|

  import bs4
  …
  class IWLink:
-     def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
+     def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response):
          self.iw_prefix = iw_prefix            # e.g. "wp"
          self.prefix_url = prefix_url          # e.g. "https://en.wikipedia.org/wiki/"
-         self.full_url = full_url              # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
-         self.page_name = page_name            # "Easter egg"
-         self.page_slug = page_slug            # "Easter_egg"
+         self.full_url = full_url              # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy"
+         self.page_name = page_name            # "Marathon (series)#Rampancy"
+         self.page_name_only = page_name_only  # "Marathon (series)"
+         self.page_slug = page_slug            # "Marathon_(series)#Rampancy"
+         self.hosting_page = hosting_page      # "Easter eggs"; page where the link was found
          self.curl_response = curl_response    # a class defined in the Requests library
  …
  # Prints the name of a page on which something occurred, if it has not been printed before
- def possibly_print(page_name):
+ def possibly_print(the_link):
      global debug
      global name_printed
  …
      if not name_printed and not debug:
          pywikibot.stdout('')
-         pywikibot.stdout('From page "{}":'.format(page_name))
+         pywikibot.stdout('From page "{}":'.format(the_link.hosting_page))
          name_printed = 1
  …
      # Isolate section link
-     target_page_name, anchor_name = the_link.page_slug.split('#')
-     target_page_name_human = target_page_name.replace('_', ' ')
+     _, anchor_name = the_link.page_slug.split('#')

      # Convert dot-notation hex entities to proper characters
  …
      # Tell user what we found
      if found_section == False:
-         possibly_print(the_link.page_name)
-         pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
+         possibly_print(the_link)
+         pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
          errors_issued = errors_issued + 1
      elif print_result == True:
-         pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
+         pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))

- # For a link that redirected us to another page, extract the name of the target page from
- # the target page's source
+ # For a link that redirected us to another page, extract the name of the target page from the target page's source
  def find_canonical_link(the_link):
      # Extract link from this markup which contains name of redirected-to page:
  …
      canonical_name = canonical_name[:tag_end]
      if len(canonical_name) > 100:
-         # Certain things can cause the trim to fail; report error and avoid slamming the
-         # output with massive page source from a failed trim
+         # Certain things can cause the trim to fail; report error and avoid slamming the output with massive page source from a failed trim
          pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
          errors_issued = errors_issued + 1
      else:
-         canonical_name = canonical_name.replace('_', ' ')
+         the_link.page_name = canonical_name.replace('_', ' ')
          if '#' in the_link.page_slug:
-             _, anchor_name = the_link.page_slug.split('#')
-             pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
-             the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name) # update page slug so that find_section() uses the right page name in its messages
+             the_link.page_name_only, _ = the_link.page_slug.split('#')
+             pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_name_only, the_link.page_name))
              find_section(the_link, True)
          else:
-             pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))
+             pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, the_link.page_name))

  # Test an interwiki link and look for a section link if applicable
  …
      the_link.curl_response = fetch(the_link.full_url)

-     # One way we tell that a redirect occurred is by checking fetch's history, as it
-     # automatically follows redirects. This will catch formal redirects which come from pages
-     # such as Special:PermanentLink.
+     # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from
+     # pages such as Special:PermanentLink.
      if the_link.curl_response.history != []:
-         possibly_print(the_link.page_name)
+         possibly_print(the_link)

          # If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
  …
          errors_issued = errors_issued + 1
      elif the_link.curl_response.status_code != 200:
-         possibly_print(the_link.page_name)
+         possibly_print(the_link)
          pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
          errors_issued = errors_issued + 1
-     # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
-     # using JavaScript, while returning code OK 200 as if the link was correct; this happens
-     # when a redirect page is accessed. We must detect these soft redirects by looking at the
-     # page source to find the redirect note inserted at the top of the page for the reader.
+     # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily using JavaScript, while returning code OK 200 as if the link was correct; this
+     # happens when a redirect page is accessed. We must detect these soft redirects by looking at the page source to find the redirect note inserted at the top of the
+     # page for the reader.
      elif 'Redirected from <a' in the_link.curl_response.text:
          unintended_redirects_found = unintended_redirects_found + 1
-         possibly_print(the_link.page_name)
+         possibly_print(the_link)
          pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
-         find_canonical_link(the_link)
+         find_canonical_link(the_link) # calls find_section() at end
      elif '#' in the_link.page_slug:
          find_section(the_link, False)
  …
      iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
      for match in re.finditer(iw_link, page_text):
-         the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", page_name, "", "")
+         the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", "", "", "", page_name, "")

          # Extract just the page title from this regex match
  …
          e = match.end() - 1

-         # Commonly we use spaces instead of underscores, so fix that before querying
+         # Use underscores in the slug used to construct the URL, but retain spaces for the printable name
          the_link.page_slug = page_text[s:e].replace(' ', '_')
-
-         # But use spaces for title when printing it
-         page_title_human = the_link.page_slug.replace('_', ' ')
-         if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, page_title_human))
+         the_link.page_name = page_text[s:e]
+         if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, the_link.page_name))
          iw_found = iw_found + 1
  …
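Note: find_section() mentions converting dot-notation hex entities before comparing anchors. MediaWiki's legacy anchor encoding works like percent-encoding with "." in place of "%" (e.g. ".27" for an apostrophe). The decoder below is an illustrative sketch of that step, simplified to single-byte characters; it is not the script's actual implementation.

import re

# Decode MediaWiki dot-notation anchors: ".27" -> "'", ".28" -> "(", etc.
# Simplified sketch: assumes single-byte (ASCII) code points only.
def decode_anchor(anchor):
    return re.sub(r'\.([0-9A-F]{2})',
                  lambda m: chr(int(m.group(1), 16)),
                  anchor)

print(decode_anchor('It.27s_a_section'))  # prints: It's_a_section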
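Note: the comments in test_interwiki_link() describe two separate redirect checks: hard redirects, which fetch() follows automatically and records in the response's history, and soft redirects, where MediaWiki returns 200 along with a "Redirected from" note in the page body. Here is a minimal standalone sketch of both checks, using the Requests library directly rather than pywikibot's fetch() wrapper; the URL is only an example.

import requests

def check_redirects(url):
    response = requests.get(url, timeout=30)

    # Hard redirects (e.g. from Special:PermanentLink) are followed
    # automatically; each hop is recorded in response.history.
    if response.history:
        print('Hard redirect:', ' -> '.join(r.url for r in response.history), '->', response.url)

    # Soft redirects return 200 with the target page's HTML; MediaWiki marks
    # them with a "Redirected from" note near the top of the rendered page.
    if 'Redirected from <a' in response.text:
        print('Soft redirect detected in page source')

    return response

check_redirects('https://en.wikipedia.org/wiki/WP:BEANS')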
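Note: the last hunk replaces the old page_title_human juggling with two fields derived from the same regex match: page_slug (underscores, for the URL) and page_name (spaces, for messages). The sketch below shows that loop end to end; the wikitext, the prefix table, and the offset arithmetic for s are illustrative guesses, since the line defining s is elided from the changeset.

import re

interwiki_urls = {'wp': 'https://en.wikipedia.org/wiki/'}
page_text = 'See [[wp:Marathon (series)#Rampancy|rampancy]] for details.'

prefix = 'wp'
iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
for match in re.finditer(iw_link, page_text):
    s = match.start() + len(prefix) + 3           # skip past "[[wp:" (assumed)
    e = match.end() - 1                           # drop the trailing "|" or "]"
    page_slug = page_text[s:e].replace(' ', '_')  # underscores for the URL
    page_name = page_text[s:e]                    # spaces for printed messages
    full_url = interwiki_urls[prefix] + page_slug
    print(page_name, '->', full_url)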
Note: See TracChangeset for help on using the changeset viewer.