Changeset 1174


Ignore:
Timestamp:
Jun 29, 2022, 12:11:41 AM (2 years ago)
Author:
iritscen
Message:

ValBot: check_interwiki_links.py now properly detects and reports links leading to redirect pages on other wikis.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • ValBot/Python/check_interwiki_links.py

    r1170 r1174  
    1414import pywikibot
    1515import re
    16 import requests
     16import requests # for listing members with dir()
    1717
    1818from pywikibot.bot import QuitKeyboardInterrupt
     
    5151            page_title = page_text[s:e].replace(' ', '_')
    5252
     53            # Use only spaces for title when printing it
     54            page_title_human = page_title.replace('_', ' ')
     55            pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title_human))
     56            iw_found = iw_found + 1
     57
    5358            # Construct full URL for the particular wiki
    5459            iw_url = interwiki_urls[cur] + page_title
    55             pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title))
    56             iw_found = iw_found + 1
    5760
    5861            # Adjust URL if this is a foreign-language WP link
     
    6568
    6669            # Test the URL
    67             #pywikibot.stdout('   Testing URL "{}"'.format(iw_url))
    6870            response = fetch(iw_url)
    6971
    70             # Redirects are followed automatically by fetch() and treated as "200"s, so the
    71             # way we tell that a redirect occurred is by checking the history
     72            # One way we tell that a redirect occurred is by checking the history
    7273            if response.history != []:
    7374                pywikibot.stdout('   ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
     
    7576            elif response.status_code != 200:
    7677                pywikibot.stdout('   ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
     78                errors_issued = errors_issued + 1
     79            # The usual way that a redirect occurs is that MediaWiki redirects us sneakily
     80            # using JavaScript, while returning code OK 200 as if the link was correct; we
     81            # must detect this from the page source
     82            elif 'Redirected from <a' in response.text:
     83                # Extract link from this source which contains name of redirected-to page:
     84                # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
     85                canonical_name = response.text.split('<link rel="canonical" href="')[-1]
     86                prefix_length = len(interwiki_urls[cur])
     87                canonical_name = canonical_name[prefix_length:]
     88                tag_end = canonical_name.find('"/>')
     89                if tag_end == -1:
     90                   pywikibot.stdout('   ERROR: This is a redirect page (but I could not isolate the correct page name).')
     91                else:
     92                   canonical_name = canonical_name[:tag_end]
     93                   if len(canonical_name) > 100:
     94                      # Certain things can cause the trim to fail; here we avoid slamming
     95                      # the output with massive page source from a failed trim
     96                      pywikibot.stdout('   ERROR: This is a redirect to "{}" (string trimmed to 100 chars due to excessive length).'.format(canonical_name[:100]))
     97                   else:
     98                      canonical_name = canonical_name.replace('_', ' ')
     99                      pywikibot.stdout('   ERROR: This is a redirect to "{}".'.format(canonical_name))
    77100                errors_issued = errors_issued + 1
    78101            elif '#' in page_title:
     
    92115                    span_name = span_tag.get('id', None)
    93116                    if span_name == anchor_name:
    94                         #pywikibot.stdout('Found section!')
    95117                        found_section = True
    96118                        break
Note: See TracChangeset for help on using the changeset viewer.