Changeset 1185 for ValBot/Python/check_intrawiki_section_links.py
Timestamp: Aug 15, 2023, 4:03:16 AM
File: 1 edited
Legend:
    Unmodified (context) lines are prefixed with a space
    Added lines are prefixed with "+"
    Removed lines are prefixed with "-"
ValBot/Python/check_intrawiki_section_links.py
--- ValBot/Python/check_intrawiki_section_links.py (r1179)
+++ ValBot/Python/check_intrawiki_section_links.py (r1185)

 onigalore_url = 'https://wiki.oni2.net/'

-# Tuple of interwiki prefixes, for passing over such links
+# Tuple of interwiki prefixes, for recognizing and passing over such links
 interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

…
 advice_issued = 0
 errors_issued = 0
+name_printed = 0
+
+# Prints the name of a page on which something occurred, if it has not been printed before
+def possibly_print(page_name):
+    global debug
+    global name_printed
+
+    if not name_printed and not debug:
+        pywikibot.stdout('')
+        pywikibot.stdout('From page "{}":'.format(page_name))
+        name_printed = 1
+
+# Search a page for the section specified in the link
+def find_section(page_text, page_name, page_slug, print_result):
+    global errors_issued
+
+    # Isolate section link
+    target_page_name, anchor_name = page_slug.split('#', 1)
+    target_page_name_human = target_page_name.replace('_', ' ')
+    if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
+
+    # Convert slash character to the dot-notation hex encoding that MediaWiki uses
+    anchor_name = anchor_name.replace('/', '.2F')
+
+    # Read linked page to see if it really has this anchor link
+    soup = BeautifulSoup(page_text, 'html.parser')
+    found_section = False
+    for span_tag in soup.findAll('span'):
+        span_name = span_tag.get('id', None)
+        if span_name == anchor_name:
+            if debug and not print_result: pywikibot.stdout(' Found section in a span!')
+            found_section = True
+            break
+    if found_section == False:
+        # Search for a div with this ID
+        for span_tag in soup.findAll('div'):
+            span_name = span_tag.get('id', None)
+            if span_name == anchor_name:
+                if debug and not print_result: pywikibot.stdout(' Found section in a div!')
+                found_section = True
+                break
+    if found_section == False:
+        possibly_print(page_name)
+        pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, pre_section))
+        errors_issued += 1
+    elif debug and print_result:
+        pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
+
+# For a link that redirected us to another page, extract the name of the target page from
+# the target page's source
+def find_canonical_link(page_text, page_name, page_slug):
+    # Extract link from this markup which contains name of redirected-to page:
+    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
+    # "wgPageName":"Namespace:Page_name",
+    canonical_name = page_text.split('"wgPageName":"')[-1]
+    tag_end = canonical_name.find('",')
+
+    if tag_end == -1:
+        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
+        errors_issued = errors_issued + 1
+    else:
+        canonical_name = canonical_name[:tag_end]
+        if len(canonical_name) > 100:
+            # Certain things can cause the trim to fail; report error and avoid slamming the
+            # output with massive page source from a failed trim
+            pywikibot.stdout(' ERROR: The link "{}" is a redirect to "{2}…" (string overflow).'.format(page_slug, canonical_name[:100]))
+            errors_issued = errors_issued + 1
+        else:
+            canonical_name = canonical_name.replace('_', ' ')
+            if '#' in page_slug:
+                _, anchor_name = page_slug.split('#')
+                if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
+                find_section(page_text, page_name, page_slug, True)
+            else:
+                pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))
+
+# Test an intrawiki link and look for a section link if applicable
+def test_intrawiki_link(iw_url, page_name, page_slug):
+    global advice_issued
+    global errors_issued
+
+    response = fetch(iw_url)
+
+    # One way we tell that a redirect occurred is by checking fetch's history, as it
+    # automatically follows redirects. This will catch formal redirects which come from pages
+    # such as Special:PermanentLink.
+    if response.history != []:
+
+        permalink1 = 'Special:PermanentLink/'.lower()
+        permalink2 = 'Special:Permalink/'.lower()
+        page_slug_lower = page_slug.lower()
+        if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
+            if debug:
+                possibly_print(page_name)
+                pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
+            find_canonical_link(response.text, page_name, page_slug)
+        else:
+            possibly_print(page_name)
+            pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
+            advice_issued += 1
+    elif response.status_code != 200:
+        possibly_print(page_name)
+        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
+        errors_issued += 1
+    # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
+    # using JavaScript, while returning code OK 200 as if the link was correct; this happens
+    # when a redirect page is accessed. We must detect these soft redirects by looking at the
+    # page source to find the redirect note inserted at the top of the page for the reader.
+    elif 'Redirected from <a' in response.text:
+        if debug:
+            possibly_print(page_name)
+            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
+        find_canonical_link(response.text, page_name, page_slug)
+    else: # URL is OK, so proceed
+        find_section(response.text, page_name, page_slug, False)

 # Searches the given page text for intrawiki links with section links in them
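Aside (not part of the changeset): the redirect handling added above boils down to three checks on the fetched page. A minimal sketch of that logic, using the requests library instead of Pywikibot's fetch() and a made-up page URL, so the names here are illustrative only:

    import requests

    def classify_wiki_response(url):
        # requests, like Pywikibot's fetch(), follows formal HTTP 30x redirects
        # automatically and records the intermediate responses in .history
        response = requests.get(url)
        if response.history:
            return 'formal redirect (HTTP {})'.format(response.history[0].status_code)
        if response.status_code != 200:
            return 'error: HTTP {}'.format(response.status_code)
        # MediaWiki serves redirect pages with HTTP 200 and only notes the hop in
        # the rendered HTML, so treat that note as a "soft" redirect and pull the
        # target name out of the embedded "wgPageName" config value
        if 'Redirected from <a' in response.text:
            target = response.text.split('"wgPageName":"')[-1]
            end = target.find('",')
            if end != -1:
                target = target[:end]
            return 'soft redirect to "{}"'.format(target)
        return 'OK'

    # classify_wiki_response('https://wiki.oni2.net/Some_page')  # hypothetical page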
…
     global advice_issued
     global errors_issued
+    global name_printed
     pages_checked += 1
     name_printed = 0
…
             s = match.start() + target_start # remove the link-opening markup
             e = match.end() - target_end # remove the link-ending markup
-            link_text = page_text[s:e]
+            page_slug = page_text[s:e]

             # The second link type will look like "Page|Section" or "|Section", so fix that pipe
             if i == 1:
-                link_text = link_text.replace('|', '#')
+                page_slug = page_slug.replace('|', '#')

             # Sometimes we use a space char. instead of a '_', so fix that before querying
-            link_text = link_text.replace(' ', '_')
-            if debug: pywikibot.stdout(' Found link {0}.'.format(link_text))
+            page_slug = page_slug.replace(' ', '_')
+            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

             # If this link doesn't have a section link in it, then we don't care about it, as
             # MediaWiki takes care of checking basic intrawiki links
-            if not '#' in link_text:
+            if not '#' in page_slug:
                 if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                 continue
…
             if found_iw_match == False:
                 for prefix in interwiki_prefixes:
-                    if prefix + ":" in link_text:
-                        if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(link_text))
+                    if prefix + ":" in page_slug:
+                        if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                         is_interwiki = True
                         break
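Aside (not part of the changeset): the renamed page_slug variable always ends up holding a "Page#Section" string. A small illustration, with a hypothetical helper name, of the normalization the loop above performs before any lookup happens:

    def normalize_slug(link_text, second_link_type=False):
        slug = link_text
        if second_link_type:
            # the "Page|Section" / "|Section" capture uses a pipe where the anchor goes
            slug = slug.replace('|', '#')
        # editors sometimes write spaces where MediaWiki page names use underscores
        return slug.replace(' ', '_')

    # normalize_slug('Quotes|First Weapon', second_link_type=True)  # -> 'Quotes#First_Weapon'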
…
                 continue

-            # If there is a '{' in the link, then probably it's a link built on transcluded text
-            # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
-            if '{' in link_text:
+            # If there is a '{' in the link, then probably it's a link built on transcluded text.
+            # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using
+            # our "chapter_names" array. If it's another type of transclusion, punt it to the user.
+            if '{' in page_slug:
                 ch_link_pattern = re.compile(r"{{C[0-9]*}}")
-                ch_link = ch_link_pattern.search(link_text)
+                ch_link = ch_link_pattern.search(page_slug)
                 if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))
                 if ch_link:
…
                             ch_name = chapter_names[ch_num_match]
                             replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}")
-                            link_text = replace_pattern.sub(ch_name, link_text)
-                            if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(link_text))
+                            page_slug = replace_pattern.sub(ch_name, page_slug)
+                            if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))
                         else:
-                            if not name_printed and not debug:
-                                pywikibot.stdout('From page "{}":'.format(page_name))
-                                name_printed = 1
-                            pywikibot.stdout(' ADVICE: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(link_text, ch_num_match))
-                            advice_issued += 1
+                            possibly_print(page_name)
+                            pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
+                            errors_issued += 1
                             continue
                     else:
-                        if not name_printed and not debug:
-                            pywikibot.stdout('From page "{}":'.format(page_name))
-                            name_printed = 1
-                        pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(link_text))
+                        possibly_print(page_name)
+                        pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                         advice_issued += 1
                         continue
                 else:
-                    if not name_printed and not debug:
-                        pywikibot.stdout('From page "{}":'.format(page_name))
-                        name_printed = 1
-                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(link_text, "{{C7}}"))
+                    possibly_print(page_name)
+                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                     advice_issued += 1
                     continue
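Aside (not part of the changeset): the chapter-template handling above expands a "{{Cn}}" transclusion into the section name it would render as. A condensed sketch of the same idea; the chapter_names list shown here is a placeholder, not the script's real one:

    import re

    def expand_chapter_template(page_slug, chapter_names):
        match = re.search(r"\{\{C([0-9]+)\}\}", page_slug)
        if not match:
            return page_slug                 # no chapter transclusion present
        ch_num = int(match.group(1))
        if ch_num >= len(chapter_names):
            return None                      # out-of-range chapter number
        # swap the template call for the chapter's actual heading text
        return page_slug.replace(match.group(0), chapter_names[ch_num])

    # expand_chapter_template('Quotes/Diary#{{C1}}', ['CHAPTER_00_PLACEHOLDER', 'CHAPTER_01_PLACEHOLDER'])
    #   -> 'Quotes/Diary#CHAPTER_01_PLACEHOLDER'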
…
             # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
             # we're out of luck.
-            if link_text.startswith('/'):
-                link_text = page_name + link_text
-                if debug: pywikibot.stdout(' Changed link_text to {} on account of "/".'.format(link_text))
+            if page_slug.startswith('/'):
+                page_slug = page_name + page_slug
+                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

             # If this is a relative "../" link, find the parent page, set ourselves to that page,
             # then remove the relative portion of the link. Note that this is only performed once,
             # so if there's multiple steps back ("../../"), we're out of luck.
-            if link_text.startswith('../'):
+            if page_slug.startswith('../'):
                 last_slash = page_name.rfind('/')
                 page_name2 = page_name[0:last_slash]
                 if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
-                link_text = link_text[3:len(link_text)]
-                if debug: pywikibot.stdout(' Changed link_text to {} on account of "../".'.format(link_text))
+                page_slug = page_slug[3:len(page_slug)]
+                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                 # If this is now going to be a bare section link for the parent page, don't add a
                 # slash, otherwise do because we are drilling down to another subpage
-                if link_text.startswith('#'):
-                    link_text = page_name2 + link_text
+                if page_slug.startswith('#'):
+                    page_slug = page_name2 + page_slug
                 else:
-                    link_text = page_name2 + '/' + link_text
+                    page_slug = page_name2 + '/' + page_slug

             # If this is a bare section link, build URL based on this page
-            if link_text.startswith('#'):
+            if page_slug.startswith('#'):
                 iw_url = onigalore_url + page_name2
                 iw_found += 1
-                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(link_text))
+                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                 found_iw_match = True
-                link_text = page_name2 + link_text
+                page_slug = page_name2 + page_slug

             # If there's no ":" in the link (before the section link, where a colon would just be
             # part of the text) then it's a Main namespace article; proceed with building URL
             if found_iw_match == False:
-                if not re.search(":.*#", link_text):
-                    iw_url = onigalore_url + link_text
+                if not re.search(":.*#", page_slug):
+                    iw_url = onigalore_url + page_slug
                     iw_found += 1
                     if debug: pywikibot.stdout(' Link is to a Main namespace page.')
…
             if found_iw_match == False:
                 for prefix in intrawiki_prefixes:
-                    if prefix + ":" in link_text:
-                        iw_url = onigalore_url + link_text
+                    if prefix + ":" in page_slug:
+                        iw_url = onigalore_url + page_slug
                         if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                         iw_found += 1
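Aside (not part of the changeset): the leading-slash and "../" handling above resolves subpage and parent-page links against the page being scanned. A compact restatement with a hypothetical helper and made-up target names; like the script, it only handles one level in either direction:

    def resolve_relative_slug(page_name, page_slug):
        if page_slug.startswith('/'):
            # "/Subpage#Section" hangs off the current page
            return page_name + page_slug
        if page_slug.startswith('../'):
            parent = page_name[:page_name.rfind('/')]
            rest = page_slug[3:]
            # a bare "#Section" attaches directly to the parent page
            return parent + rest if rest.startswith('#') else parent + '/' + rest
        return page_slug

    # resolve_relative_slug('Quotes/Diary', '../Speech#Example_section')
    #   -> 'Quotes/Speech#Example_section'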
…
             # If we still haven't turned this match into a URL, something's gone wrong
             if (found_iw_match == False) or (iw_url == ""):
-                if not name_printed and not debug:
-                    pywikibot.stdout('From page "{}":'.format(page_name))
-                    name_printed = 1
-                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(link_text))
+                possibly_print(page_name)
+                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                 continue

             # Test the URL
             iw_url = iw_url.replace(' ', '_')
-            if debug: pywikibot.stdout(' Reading page at {}...'.format(iw_url))
-            response = fetch(iw_url)
-
-            # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
-            # tell that a redirect occurred is by checking fetch's history
-            if response.history != []:
-                if not name_printed and not debug:
-                    pywikibot.stdout('From page "{}":'.format(page_name))
-                    name_printed = 1
-                pywikibot.stdout(' ADVICE: Got redirection code ({0}) on URL "{1}". You should check the link manually.'.format(response.history[0], iw_url))
-                advice_issued += 1
-            elif response.status_code != 200:
-                if not name_printed and not debug:
-                    pywikibot.stdout('From page "{}":'.format(page_name))
-                    name_printed = 1
-                pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
-                errors_issued += 1
-            else:
-                # Isolate section link
-                pre_section, section_name = link_text.split('#', 1)
-                if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(section_name))
-
-                # Convert slash character to the dot-notation hex encoding that MediaWiki uses
-                section_name = section_name.replace('/', '.2F')
-
-                # Read linked page to see if it really has this anchor link
-                soup = BeautifulSoup(response.text, 'html.parser')
-                found_section = False
-                for span_tag in soup.findAll('span'):
-                    span_name = span_tag.get('id', None)
-                    if span_name == section_name:
-                        if debug: pywikibot.stdout(' Found section!')
-                        found_section = True
-                        break
-                if found_section == False:
-                    if not name_printed and not debug:
-                        pywikibot.stdout('From page "{}":'.format(page_name))
-                        name_printed = 1
-                    pywikibot.stdout(' ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
-                    errors_issued += 1
-
-def main(*args):
-    global debug
+            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
+            test_intrawiki_link(iw_url, page_name, page_slug)
+
+# Print a wrap-up message
+def print_summary():
     global pages_checked
     global iw_found
     global advice_issued
     global errors_issued
+
+    page_str = "pages"
+    if pages_checked == 1:
+        page_str = "page"
+
+    link_str = "links"
+    if iw_found == 1:
+        link_str = "link"
+
+    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
+    pywikibot.stdout('While attempting to follow section links….')
+
+    if advice_issued == 0:
+        pywikibot.stdout(' No advice on potential problems was issued.')
+    elif advice_issued == 1:
+        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
+    else:
+        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))
+
+    error_str = "errors were"
+    if errors_issued == 1:
+        error_str = "error was"
+    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
+
+# Main function
+def main(*args):
+    global debug
     search_cat = ''
     search_page = ''

+    # Process arguments
     local_args = pywikibot.handle_args(args)
-    genFactory = pagegenerators.GeneratorFactory()
-
     for arg in local_args:
         if arg.startswith('-cat:'):
…
     #pywikibot.stdout(format(dir(page)))

+    # Check specified page or loop through specified category and check all pages
     if search_cat != '':
         cat_obj = pywikibot.Category(site, search_cat)
…
         scan_for_intrawiki_links(page.text, page.title())

-    page_str = "pages"
-    if pages_checked == 1:
-        page_str = "page"
-
-    link_str = "links"
-    if iw_found == 1:
-        link_str = "link"
-
-    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
-    pywikibot.stdout('While attempting to follow section links...')
-
-    if advice_issued == 0:
-        pywikibot.stdout(' No advice on potential problems was issued.')
-    elif advice_issued == 1:
-        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
-    else:
-        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))
-
-    error_str = "errors were"
-    if errors_issued == 1:
-        error_str = "error was"
-    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
+    # Print the results
+    print_summary()

 if __name__ == '__main__':
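Aside (not part of the changeset): a usage sketch. The invocation style is assumed (running the script like an ordinary Pywikibot user script), and the "-page:" form is inferred from the search_page variable rather than shown in these hunks:

    # Hypothetical invocations; the category and page names are placeholders:
    #
    #   python check_intrawiki_section_links.py -cat:"Quotes"
    #   python check_intrawiki_section_links.py -page:"Quotes/Diary"
    #
    # pywikibot.handle_args() strips the global Pywikibot options (e.g. -lang,
    # -family) before the -cat:/-page: loop in main() sees its arguments.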