Ignore:
Timestamp:
Mar 21, 2022, 10:22:33 PM (3 years ago)
Author:
iritscen
Message:

ValBot: check_interwiki_links.py: Added code header, improved output, improved section name detection in target page.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • ValBot/Python/check_interwiki_links.py

r1169 r1170
          1 + # Check Interwiki Links
          2 + # by iritscen@yahoo.com
          3 + # Looks at each link on a page (or in all the pages in a category) which uses a registered
          4 + # interwiki prefix and loads the linked page, verifying that it exists and that any section
          5 + # link, if present, is valid as well. The output will use the word "ERROR" when it cannot
          6 + # validate the interwiki link.
          7 + # Recommended viewing width:
          8 + # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
          9 +
    1    10   import os
    2    11
    …
    5    14   import pywikibot
    6    15   import re
         16 + import requests
    7    17
    8    18   from pywikibot.bot import QuitKeyboardInterrupt
    …
   20    30   pages_checked = 0
   21    31   iw_found = 0
   22       - problems_found = 0
         32 + errors_issued = 0
   23    33
   24    34   # Searches the given page text for interwiki links
    …
   26    36       global pages_checked
   27    37       global iw_found
   28       -     global problems_found
         38 +     global errors_issued
   29    39       pages_checked = pages_checked + 1
   30    40       cur = 0
    …
   43    53               # Construct full URL for the particular wiki
   44    54               iw_url = interwiki_urls[cur] + page_title
   45       -             pywikibot.output('Found {0} link {1}.'.format(prefix, page_title))
         55 +             pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title))
   46    56               iw_found = iw_found + 1
   47    57
    …
   55    65
   56    66               # Test the URL
   57       -             #pywikibot.output('Testing URL {}...'.format(iw_url))
         67 +             #pywikibot.stdout('   Testing URL "{}"'.format(iw_url))
   58    68               response = fetch(iw_url)
   59    69
    …
   61    71               # way we tell that a redirect occurred is by checking the history
   62    72               if response.history != []:
   63       -                 pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
   64       -                 problems_found = problems_found + 1
         73 +                 pywikibot.stdout('   ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
         74 +                 errors_issued = errors_issued + 1
   65    75               elif response.status_code != 200:
   66       -                 #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
   67       -                 problems_found = problems_found + 1
         76 +                 pywikibot.stdout('   ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
         77 +                 errors_issued = errors_issued + 1
   68    78               elif '#' in page_title:
   69    79                   # Isolate section link
   70       -                 pywikibot.output('Detected section link on page {0}.'.format(page_title))
   71    80                   page_name, anchor_name = page_title.split('#')
   72    81
    …
   80    89                   soup = BeautifulSoup(response.text, 'html.parser')
   81    90                   found_section = False
   82       -                 for tag in soup.findAll('a'):
   83       -                     link = tag.get('href', None)
   84       -                     if not link:
   85       -                         #pywikibot.output('It is not a link.')
   86       -                         continue
   87       -                     #pywikibot.output('Got link {0}.'.format(link))
   88       -                     if not link.startswith('#'):
   89       -                         continue
   90       -
   91       -                     if link == '#' + anchor_name:
   92       -                         pywikibot.output('Found section link!')
         91 +                 for span_tag in soup.findAll('span'):
         92 +                     span_name = span_tag.get('id', None)
         93 +                     if span_name == anchor_name:
         94 +                         #pywikibot.stdout('Found section!')
   93    95                           found_section = True
   94    96                           break
   95    97                   if found_section == False:
   96       -                     pywikibot.output('Could not find section {0} on page {1}.'.format(anchor_name, page_name))
   97       -                     problems_found = problems_found + 1
         98 +                     pywikibot.stdout('   ERROR: Could not find section {0} on page {1}.'.format(anchor_name, page_name))
         99 +                     errors_issued = errors_issued + 1
   98   100           cur = cur + 1
   99   101
    …
  113   115       site = pywikibot.Site()
  114   116
  115       -     # This line of code enumerates the methods in the 'page' class
  116       -     #pywikibot.stdout(format(dir(page)))
        117 +     #pywikibot.stdout('The members of the requests.models.Response class are:')
        118 +     #pywikibot.stdout(format(dir(requests.models.Response)))
  117   119
  118   120       if cat_name != '':
    …
  120   122           generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
  121   123           for page in pagegenerators.PreloadingGenerator(generator, 100):
  122       -             pywikibot.stdout('Checking page {0}'.format(page.title()))
        124 +             pywikibot.stdout('Checking page "{}"'.format(page.title()))
  123   125               scan_for_iw_links(page.text)
  124   126       elif page_name != '':
  125   127           page = pywikibot.Page(site, page_name)
  126       -         pywikibot.stdout('Checking page {0}'.format(page.title()))
        128 +         pywikibot.stdout('Checking page "{}"'.format(page.title()))
  127   129           scan_for_iw_links(page.text)
  128   130
  129   131       global pages_checked
  130   132       global iw_found
  131       -     global problems_found
  132       -     pywikibot.stdout('Checked {0} page(s) and found {1} interwiki link(s) with {2} problem(s).'.format(pages_checked, iw_found, problems_found))
        133 +     global errors_issued
        134 +
        135 +     page_str = "pages"
        136 +     if pages_checked == 1:
        137 +         page_str = "page"
        138 +
        139 +     link_str = "links"
        140 +     if iw_found == 1:
        141 +         link_str = "link"
        142 +
        143 +     pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
        144 +
        145 +     error_str = "errors were"
        146 +     if errors_issued == 1:
        147 +         error_str = "error was"
        148 +
        149 +     pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))
  133   150
  134   151   if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.