Changeset 1171 for ValBot


Ignore:
Timestamp:
Mar 21, 2022, 10:23:25 PM (3 years ago)
Author:
iritscen
Message:

ValBot: check_intrawiki_section_links.py: Added code header, improved output, added support for relative links to subpages in source page.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • ValBot/Python/check_intrawiki_section_links.py

    r1169 r1171  
     1# Check Intrawiki Section Links
     2# by iritscen@yahoo.com
     3# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
     4# and loads the linked page and verifies that the named section actually exists. The output will
     5# use the keywords ADVICE, WARNING or ERROR depending on the nature of the issue it encounters.
     6# Recommended viewing width:
     7# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
     8
    19import os
    210
     
    2432pages_checked = 0
    2533iw_found = 0
    26 problems_found = 0
     34advice_issued = 0
     35warnings_issued = 0
     36errors_issued = 0
    2737page_name = ''
    2838
     
    3141    global pages_checked
    3242    global iw_found
    33     global problems_found
     43    global advice_issued
     44    global warnings_issued
     45    global errors_issued
    3446    global page_name
    3547    pages_checked = pages_checked + 1
     
    5163        # Sometimes we used a space char. instead of a '_', so fix that before querying
    5264        link_text = link_text.replace(' ', '_')
    53         #pywikibot.output('Found link {0}.'.format(link_text))
     65        #pywikibot.stdout('Found link {0}.'.format(link_text))
    5466       
    5567        # If this link doesn't have a section link in it, then we don't care about it, as
    5668        # MediaWiki takes care of checking basic intrawiki links
    5769        if not '#' in link_text:
    58             #pywikibot.output('Link doesn\'t have a section anchor in it. Skipping.')
     70            #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
    5971            continue
    6072       
     
    6274        # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
    6375        if '{' in link_text:
    64             pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
     76            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
     77            advice_issued = advice_issued + 1
    6578            continue
    66        
    67         # If this is a relative "../" link, find the parent page and set ourselves to that
    68         # page, then remove the relative portion of the link. Note that this is only performed
    69         # once, so if there's multiple steps back ("../../"), we're out of luck.
     79
     80        # If this is a relative "/" link, use the current page as the basis for the URL. Note
     81        # that only a leading slash is looked for, so if there are multiple steps down ("/x/y"),
     82        # we're out of luck.
     83        if link_text.startswith('/'):
     84            link_text = page_name + link_text
     85            pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
     86       
     87        # If this is a relative "../" link, find the parent page and set ourselves to that page,
     88        # then remove the relative portion of the link. Note that this is only performed once,
     89        # so if there are multiple steps back ("../../"), we're out of luck.
    7090        if link_text.startswith('../'):
    7191            last_slash = page_name.rfind('/')
    7292            page_name2 = page_name[0:last_slash]
    73             #pywikibot.output('Changed page_name to {} on account of "../".'.format(page_name2))
     93            #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
    7494            link_text = link_text[3:len(link_text)]
    75             #pywikibot.output('Changed link_text to {} on account of "../".'.format(link_text))
    76             # If this is now going to be a bare section link for the parent page, don't add
    77             # a slash, otherwise do because we are drilling down to another subpage
     95            #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
     96            # If this is now going to be a bare section link for the parent page, don't add a
     97            # slash, otherwise do because we are drilling down to another subpage
    7898            if link_text.startswith('#'):
    7999                link_text = page_name2 + link_text
     
    85105            iw_url = onigalore_url + page_name2
    86106            iw_found = iw_found + 1
    87             #pywikibot.output('Found link to this very page, {}.'.format(link_text))
     107            #pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
    88108            found_iw_match = True
    89109            link_text = page_name2 + link_text
     
    91111        # If there's no ":" in the link (before the section link, where a colon would just be
    92112        # part of the text) then it's a Main namespace article, so construct URL
    93         #if not ':' in link_text:
    94113        if found_iw_match == False:
    95114            if not re.search(":.*#", link_text):
    96115                iw_url = onigalore_url + link_text
    97116                iw_found = iw_found + 1
    98                 #pywikibot.output('Found link to OniGalore Main namespace page {}.'.format(link_text))
     117                #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
    99118                found_iw_match = True
    100119           
     
    102121        if found_iw_match == False:
    103122            for prefix in intrawiki_prefixes:
    104                 #pywikibot.output('Comparing link against prefix {}.'.format(prefix))
     123                #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
    105124                if prefix + ":" in link_text:
    106125                    iw_url = onigalore_url + link_text
    107126                    _, post_ns = link_text.split(':', 1)
    108                     #pywikibot.output('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
     127                    #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
    109128                    iw_found = iw_found + 1
    110129                    found_iw_match = True
     
    117136            for prefix in interwiki_prefixes:
    118137                if prefix + ":" in link_text:
    119                     #pywikibot.output('Skipping link {} because it is an interwiki link.'.format(link_text))
     138                    #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
    120139                    is_interwiki = True
    121140                    break
     
    125144        # If we still haven't turned this match into a URL, something's gone wrong
    126145        if (found_iw_match == False) or (iw_url == ""):
    127             pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
     146            pywikibot.stdout('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
    128147            quit()
    129148
    130149        # Test the URL
    131150        iw_url = iw_url.replace(' ', '_')
    132         #pywikibot.output('Reading page at {}...'.format(iw_url))
     151        #pywikibot.stdout('Reading page at {}...'.format(iw_url))
    133152        response = fetch(iw_url)
    134153
    135         # Redirects are followed automatically by fetch() and treated as "200"s, so the
    136         # way we tell that a redirect occurred is by checking the history
     154        # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
     155        # tell that a redirect occurred is by checking fetch's history
    137156        if response.history != []:
    138             pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
    139             problems_found = problems_found + 1
     157            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
     158            warnings_issued = warnings_issued + 1
    140159        elif response.status_code != 200:
    141             #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
    142             problems_found = problems_found + 1
     160            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
     161            warnings_issued = warnings_issued + 1
    143162        else:
    144163            # Isolate section link
    145164            pre_section, section_name = link_text.split('#', 1)
    146             #pywikibot.output('Searching for section link {} on page.'.format(section_name))
     165            #pywikibot.stdout('Searching for section link {} on page.'.format(section_name))
    147166           
    148167            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
     
    155174                span_name = span_tag.get('id', None)
    156175                if span_name == section_name:
    157                     #pywikibot.output('Found section!')
     176                    #pywikibot.stdout('Found section!')
    158177                    found_section = True
    159178                    break
    160179            if found_section == False:
    161                 pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
    162                 problems_found = problems_found + 1
     180                pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
     181                errors_issued = errors_issued + 1
    163182
    164183def main(*args):
     
    194213    global pages_checked
    195214    global iw_found
    196     global problems_found
    197     pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))
     215    global advice_issued
     216    global warnings_issued
     217    global errors_issued
     218
     219    page_str = "pages"
     220    if pages_checked == 1:
     221        page_str = "page"
     222
     223    link_str = "links"
     224    if iw_found == 1:
     225        link_str = "link"
     226
     227    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
     228    pywikibot.stdout('While attempting to follow section links...')
     229
     230    if advice_issued == 0:
     231        pywikibot.stdout('  No advice on potential problems was issued.')
     232    elif advice_issued == 1:
     233        pywikibot.stdout('  1 piece of advice on a potential problem was issued.')
     234    else:
     235        pywikibot.stdout('  {} pieces of advice on potential problems were issued.'.format(advice_issued))
     236
     237    warning_str = "warnings were"
     238    if warnings_issued == 1:
     239        warning_str = "warning was"
     240    pywikibot.stdout('  {0} {1} issued.'.format(warnings_issued, warning_str))
     241
     242    error_str = "errors were"
     243    if errors_issued == 1:
     244        error_str = "error was"
     245    pywikibot.stdout('  {0} {1} encountered.'.format(errors_issued, error_str))
    198246
    199247if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.