Changeset 1180


Timestamp: Apr 28, 2023, 2:54:21 AM (20 months ago)
Author: iritscen
Message:
ValBot: check_interwiki_links.py: Improved output and placed some output under a "-dbg" argument. Added recognition of "WP:" shortcut links.
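As a rough usage sketch (the invocation style and all page, category, and prefix names below are illustrative assumptions, not part of this changeset), the new flag is passed like any other script argument:

   python check_interwiki_links.py -cat:"Example category" -dbg

With -dbg the script also prints its per-page "Checking page" and per-link "Validating" progress lines; without it, only errors and the final summary are printed, with errors grouped under the page they were found on, along the lines of:

   From page "Example page":
      ERROR: Got response code 404 for wikipedia link "Example article". The page may not exist.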

File: 1 edited

Legend: lines beginning with "+" were added in r1180, lines beginning with "-" were removed from r1174, and unprefixed lines are unchanged context.
  • ValBot/Python/check_interwiki_links.py

r1174 → r1180

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')

+# Initialize globals
+debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0

# Searches the given page text for interwiki links
-def scan_for_iw_links(page_text):
-    global pages_checked
-    global iw_found
-    global errors_issued
-    pages_checked = pages_checked + 1
-    cur = 0
-
-    for prefix in interwiki_prefixes:
-        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
-        iw_link = "\[\[" + prefix + ":[^|\]]*(\||\])"
-        for match in re.finditer(iw_link, page_text):
-            # Extract just the page title from this regex match
-            s = match.start() + 2 + len(prefix) + 1
-            e = match.end() - 1
-
-            # Sometimes we used a space char. instead of a '_', so fix that before querying
-            page_title = page_text[s:e].replace(' ', '_')
-
-            # Use only spaces for title when printing it
-            page_title_human = page_title.replace('_', ' ')
-            pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title_human))
-            iw_found = iw_found + 1
-
-            # Construct full URL for the particular wiki
-            iw_url = interwiki_urls[cur] + page_title
-
-            # Adjust URL if this is a foreign-language WP link
-            if re.match("^[a-zA-Z]{2}:", page_title):
-                lang_code = page_title[0:2] + "."
-                # "wp:" is the Wikipedia: namespace, not a language
-                if lang_code != "wp." and lang_code != "WP.":
-                    iw_url = iw_url.replace('en.', lang_code)
-                    iw_url = iw_url.replace(page_title[0:3], '')
-
-            # Test the URL
-            response = fetch(iw_url)
-
-            # One way we tell that a redirect occurred is by checking the history
-            if response.history != []:
-                pywikibot.stdout('   ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
-                errors_issued = errors_issued + 1
-            elif response.status_code != 200:
-                pywikibot.stdout('   ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
-                errors_issued = errors_issued + 1
-            # The usual way that a redirect occurs is that MediaWiki redirects us sneakily
-            # using JavaScript, while returning code OK 200 as if the link was correct; we
-            # must detect this from the page source
-            elif 'Redirected from <a' in response.text:
-                # Extract link from this source which contains name of redirected-to page:
-                # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
-                canonical_name = response.text.split('<link rel="canonical" href="')[-1]
-                prefix_length = len(interwiki_urls[cur])
-                canonical_name = canonical_name[prefix_length:]
-                tag_end = canonical_name.find('"/>')
-                if tag_end == -1:
-                   pywikibot.stdout('   ERROR: This is a redirect page (but I could not isolate the correct page name).')
-                else:
-                   canonical_name = canonical_name[:tag_end]
-                   if len(canonical_name) > 100:
-                      # Certain things can cause the trim to fail; here we avoid slamming
-                      # the output with massive page source from a failed trim
-                      pywikibot.stdout('   ERROR: This is a redirect to "{}" (string trimmed to 100 chars due to excessive length).'.format(canonical_name[:100]))
-                   else:
-                      canonical_name = canonical_name.replace('_', ' ')
-                      pywikibot.stdout('   ERROR: This is a redirect to "{}".'.format(canonical_name))
-                errors_issued = errors_issued + 1
-            elif '#' in page_title:
-                # Isolate section link
-                page_name, anchor_name = page_title.split('#')
-
-                # Convert dot-notation hex entities to proper characters
-                anchor_name = anchor_name.replace('.22', '"')
-                anchor_name = anchor_name.replace('.27', '\'')
-                anchor_name = anchor_name.replace('.28', '(')
-                anchor_name = anchor_name.replace('.29', ')')
-
-                # Read linked page to see if it really has this anchor link
-                soup = BeautifulSoup(response.text, 'html.parser')
-                found_section = False
-                for span_tag in soup.findAll('span'):
-                    span_name = span_tag.get('id', None)
-                    if span_name == anchor_name:
-                        found_section = True
-                        break
-                if found_section == False:
-                    pywikibot.stdout('   ERROR: Could not find section {0} on page {1}.'.format(anchor_name, page_name))
-                    errors_issued = errors_issued + 1
-        cur = cur + 1
+def scan_for_interwiki_links(page_text, page_name):
+   global debug
+   global pages_checked
+   global iw_found
+   global errors_issued
+   pages_checked = pages_checked + 1
+   cur = 0
+   name_printed = 0
+
+   for prefix in interwiki_prefixes:
+      # Isolate strings that start with "[[prefix:" and end with "|" or "]"
+      iw_link = "\[\[" + prefix + ":[^|\]]*(\||\])"
+      for match in re.finditer(iw_link, page_text):
+         # Extract just the page title from this regex match
+         s = match.start() + 2 + len(prefix) + 1
+         e = match.end() - 1
+
+         # Sometimes we used a space char. instead of a '_', so fix that before querying
+         page_title = page_text[s:e].replace(' ', '_')
+
+         # Use only spaces for title when printing it
+         page_title_human = page_title.replace('_', ' ')
+         if debug: pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title_human))
+         iw_found = iw_found + 1
+
+         # Construct full URL for the particular wiki
+         iw_url = interwiki_urls[cur] + page_title
+
+         # Adjust URL if this is a foreign-language WP link
+         if re.match("^[a-zA-Z]{2}:", page_title):
+            lang_code = page_title[0:2] + "."
+            # "wp:" is the Wikipedia: namespace, not a language
+            if lang_code != "wp." and lang_code != "WP.":
+               iw_url = iw_url.replace('en.', lang_code)
+               iw_url = iw_url.replace(page_title[0:3], '')
+
+         # Test the URL
+         response = fetch(iw_url)
+
+         # One way we tell that a redirect occurred is by checking the history
+         if response.history != []:
+            if not name_printed and not debug:
+               pywikibot.stdout('From page "{}":'.format(page_name))
+               name_printed = 1
+            if page_title.startswith('WP:') and page_title == page_title.upper():
+               pywikibot.stdout('   ERROR: Got redirection code ({0}) for {1} link "{2}", but this appears to be a deliberate use of a Wikipedia shortcut. You should check the link manually.'.format(response.history[0], prefix, page_title))
+            else:
+               pywikibot.stdout('   ERROR: Got redirection code ({0}) for {1} link "{2}". You should check the link manually.'.format(response.history[0], prefix, page_title))
+            errors_issued = errors_issued + 1
+         elif response.status_code != 200:
+            if not name_printed and not debug:
+               pywikibot.stdout('From page "{}":'.format(page_name))
+               name_printed = 1
+            pywikibot.stdout('   ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(response.status_code, prefix, page_title))
+            errors_issued = errors_issued + 1
+         # The usual way that a redirect occurs is that MediaWiki redirects us sneakily
+         # using JavaScript, while returning code OK 200 as if the link was correct; we
+         # must detect this from the page source
+         elif 'Redirected from <a' in response.text:
+            if not name_printed and not debug:
+               pywikibot.stdout('From page "{}":'.format(page_name))
+               name_printed = 1
+            # Extract link from this source which contains name of redirected-to page:
+            # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
+            canonical_name = response.text.split('<link rel="canonical" href="')[-1]
+            prefix_length = len(interwiki_urls[cur])
+            canonical_name = canonical_name[prefix_length:]
+            tag_end = canonical_name.find('"/>')
+            if tag_end == -1:
+               pywikibot.stdout('   ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(prefix, page_title))
+            else:
+               canonical_name = canonical_name[:tag_end]
+               if len(canonical_name) > 100:
+                  # Certain things can cause the trim to fail; here we avoid slamming
+                  # the output with massive page source from a failed trim
+                  pywikibot.stdout('   ERROR: The {0} link "{1}" is a redirect to "{2}…" (string trimmed to 100 chars).'.format(prefix, page_title, canonical_name[:100]))
+               else:
+                  canonical_name = canonical_name.replace('_', ' ')
+                  pywikibot.stdout('   ERROR: The {0} link "{1}" is a redirect to "{2}".'.format(prefix, page_title, canonical_name))
+            errors_issued = errors_issued + 1
+         elif '#' in page_title:
+            # Isolate section link
+            target_page_name, anchor_name = page_title.split('#')
+
+            # Convert dot-notation hex entities to proper characters
+            anchor_name = anchor_name.replace('.22', '"')
+            anchor_name = anchor_name.replace('.27', '\'')
+            anchor_name = anchor_name.replace('.28', '(')
+            anchor_name = anchor_name.replace('.29', ')')
+
+            # Read linked page to see if it really has this anchor link
+            soup = BeautifulSoup(response.text, 'html.parser')
+            found_section = False
+            for span_tag in soup.findAll('span'):
+               span_name = span_tag.get('id', None)
+               if span_name == anchor_name:
+                  found_section = True
+                  break
+            if found_section == False:
+               if not name_printed and not debug:
+                  pywikibot.stdout('From page "{}":'.format(page_name))
+                  name_printed = 1
+               target_page_name_human = target_page_name.replace('_', ' ')
+               pywikibot.stdout('   ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, prefix, target_page_name_human))
+               errors_issued = errors_issued + 1
+      cur = cur + 1

def main(*args):
-    cat_name = ''
-    page_name = ''
-
-    local_args = pywikibot.handle_args(args)
-    genFactory = pagegenerators.GeneratorFactory()
-
-    for arg in local_args:
-        if arg.startswith('-cat:'):
-            cat_name = arg[5:]
-        elif arg.startswith('-page:'):
-            page_name = arg[6:]
-
-    site = pywikibot.Site()
-
-    #pywikibot.stdout('The members of the requests.models.Response class are:')
-    #pywikibot.stdout(format(dir(requests.models.Response)))
-
-    if cat_name != '':
-        cat_obj = pywikibot.Category(site, cat_name)
-        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
-        for page in pagegenerators.PreloadingGenerator(generator, 100):
-            pywikibot.stdout('Checking page "{}"'.format(page.title()))
-            scan_for_iw_links(page.text)
-    elif page_name != '':
-        page = pywikibot.Page(site, page_name)
-        pywikibot.stdout('Checking page "{}"'.format(page.title()))
-        scan_for_iw_links(page.text)
-
-    global pages_checked
-    global iw_found
-    global errors_issued
-
-    page_str = "pages"
-    if pages_checked == 1:
-        page_str = "page"
-
-    link_str = "links"
-    if iw_found == 1:
-        link_str = "link"
-
-    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
-
-    error_str = "errors were"
-    if errors_issued == 1:
-        error_str = "error was"
-
-    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))
+   global debug
+   search_cat = ''
+   search_page = ''
+
+   local_args = pywikibot.handle_args(args)
+   genFactory = pagegenerators.GeneratorFactory()
+
+   for arg in local_args:
+      if arg.startswith('-cat:'):
+         search_cat = arg[5:]
+      elif arg.startswith('-page:'):
+         search_page = arg[6:]
+      elif arg == '-dbg':
+         debug = 1
+      else:
+         pywikibot.stdout('Unknown argument "{}".'.format(arg))
+         return
+
+   site = pywikibot.Site()
+
+   #pywikibot.stdout('The members of the requests.models.Response class are:')
+   #pywikibot.stdout(format(dir(requests.models.Response)))
+
+   if search_cat != '':
+      cat_obj = pywikibot.Category(site, search_cat)
+      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
+      for page in pagegenerators.PreloadingGenerator(generator, 100):
+         if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
+         scan_for_interwiki_links(page.text, page.title())
+   elif search_page != '':
+      page = pywikibot.Page(site, search_page)
+      if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
+      scan_for_interwiki_links(page.text, page.title())
+
+   global pages_checked
+   global iw_found
+   global errors_issued
+
+   page_str = "pages"
+   if pages_checked == 1:
+      page_str = "page"
+
+   link_str = "links"
+   if iw_found == 1:
+      link_str = "link"
+
+   pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
+
+   error_str = "errors were"
+   if errors_issued == 1:
+      error_str = "error was"
+
+   pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

if __name__ == '__main__':
-    main()
+   main()