Changeset 1179


Ignore:
Timestamp:
Apr 28, 2023, 2:53:24 AM (20 months ago)
Author:
iritscen
Message:

ValBot: check_intrawiki_section_links.py: Simplified output to just advice and errors. Added support for SectionLink template. Added support for links built on chapter name transclusion. Placed verbose output under a "-dbg" argument.

Location:
ValBot
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • ValBot/Python/check_intrawiki_section_links.py

    r1176 r1179  
    22# by iritscen@yahoo.com
    33# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
    4 # and loads the linked page and verifies that the named section actually exists. The output will
    5 # use the keywords ADVICE, WARNING or ERROR depending on the nature of issue that it encounters.
     4# and loads the linked page and verifies that the named section actually exists. It also
     5# understands section links generated through a call to Template:SectionLink.
    66# Recommended viewing width:
    77# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
     
    2121from bs4 import BeautifulSoup
    2222
    23 # Array of OniGalore's namespaces
     23# Tuple of OniGalore's namespaces
    2424intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')
    2525
     
    2727onigalore_url = 'https://wiki.oni2.net/'
    2828
    29 # Interwiki prefixes, for ruling out these links
     29# Tuple of interwiki prefixes, for passing over such links
    3030interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')
    3131
     32# List of chapter names, for substitution into links that use "{{Cn}}" transclusion
     33chapter_names = ['CHAPTER_00_._COMBAT_TRAINING', 'CHAPTER_01_._TRIAL_RUN', 'CHAPTER_02_._ENGINES_OF_EVIL', 'CHAPTER_03_._PUZZLE_PIECES', 'CHAPTER_04_._TIGER_BY_THE_TAIL', 'CHAPTER_05_._HOT_PURSUIT', 'CHAPTER_06_._COUNTERATTACK', 'CHAPTER_07_._A_FRIEND_IN_NEED', 'CHAPTER_08_._AN_INNOCENT_LIFE', 'CHAPTER_09_._TRUTH_AND_CONSEQUENCES', 'CHAPTER_10_._CAT_AND_MOUSE', 'CHAPTER_11_._DREAM_DIVER', 'CHAPTER_12_._SINS_OF_THE_FATHER', 'CHAPTER_13_._PHOENIX_RISING', 'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS']
     34
     35# Tuple of patterns for recognizing wikilinks
     36# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
     37# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
     38link_patterns = ("\[\[[^|\]]*(\||\])", "\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}")
     39
     40# Initialize globals
     41debug = 0
    3242pages_checked = 0
    3343iw_found = 0
    3444advice_issued = 0
    35 warnings_issued = 0
    3645errors_issued = 0
    37 page_name = ''
    3846
    3947# Searches the given page text for intrawiki links with section links in them
    40 def scan_for_iw_links(page_text):
    41     global pages_checked
    42     global iw_found
    43     global advice_issued
    44     global warnings_issued
    45     global errors_issued
    46     global page_name
    47     pages_checked = pages_checked + 1
    48 
    49     # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    50     # "[[any:thi|ng]]"
    51     iw_link = "\[\[[^|\]]*(\||\])"
    52     for match in re.finditer(iw_link, page_text):
    53         found_iw_match = False
    54         iw_url = ""
    55         page_name2 = page_name
    56    
    57         # Cut out the matched text from the page, and in the process remove the "[[" from the
    58         # front and the "|" or "]" from the end
    59         s = match.start() + 2
    60         e = match.end() - 1
    61         link_text = page_text[s:e]
    62 
    63         # Sometimes we used a space char. instead of a '_', so fix that before querying
    64         link_text = link_text.replace(' ', '_')
    65         #pywikibot.stdout('Found link {0}.'.format(link_text))
    66        
    67         # If this link doesn't have a section link in it, then we don't care about it, as
    68         # MediaWiki takes care of checking basic intrawiki links
    69         if not '#' in link_text:
    70             #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
     48def scan_for_intrawiki_links(page_text, page_name):
     49   global debug
     50   global pages_checked
     51   global iw_found
     52   global advice_issued
     53   global errors_issued
     54   pages_checked += 1
     55   name_printed = 0
     56
     57   for i, the_pattern in enumerate(link_patterns):
     58      if debug:
     59         if i == 0:
     60            pywikibot.stdout('   Checking page for wikilinks with section names.')
     61         elif i == 1:
     62            pywikibot.stdout('   Checking page for {{SectionLink}} calls.')
     63     
     64      for match in re.finditer(the_pattern, page_text):
     65         found_iw_match = False
     66         iw_url = ""
     67         page_name2 = page_name
     68   
     69         # Cut out the matched text from the page, isolating just the page+section name
     70         target_start = 2 # "[["
     71         target_end = 1 # "|" or "]" (we only match the first ending bracket)
     72         if i == 1:
     73            target_start = 14 # "{{SectionLink|"
     74            target_end = 2 # "}}"
     75         s = match.start() + target_start # remove the link-opening markup
     76         e = match.end() - target_end # remove the link-ending markup
     77         link_text = page_text[s:e]
     78         
     79         # The second link type will look like "Page|Section" or "|Section", so fix that pipe
     80         if i == 1:
     81            link_text = link_text.replace('|', '#')
     82
     83         # Sometimes we use a space char. instead of a '_', so fix that before querying
     84         link_text = link_text.replace(' ', '_')
     85         if debug: pywikibot.stdout('      Found link {0}.'.format(link_text))
     86     
     87         # If this link doesn't have a section link in it, then we don't care about it, as
     88         # MediaWiki takes care of checking basic intrawiki links
     89         if not '#' in link_text:
     90            if debug: pywikibot.stdout('         Link doesn\'t have a section anchor in it. Skipping.')
    7191            continue
    7292
    73         # If this link has an interwiki prefix, it can be ignored
    74         is_interwiki = False
    75         if found_iw_match == False:
     93         # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
     94         # for the task of checking interwiki page+section links
     95         is_interwiki = False
     96         if found_iw_match == False:
    7697            for prefix in interwiki_prefixes:
    77                 if prefix + ":" in link_text:
    78                     #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
    79                     is_interwiki = True
    80                     break
    81         if is_interwiki:
     98               if prefix + ":" in link_text:
     99                  if debug: pywikibot.stdout('         Skipping link {} because it is an interwiki link.'.format(link_text))
     100                  is_interwiki = True
     101                  break
     102         if is_interwiki:
    82103            continue
    83        
    84         # If there is a '{' in the link, then probably it's a link built on transcluded text
    85         # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
    86         if '{' in link_text:
    87             pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
    88             advice_issued = advice_issued + 1
    89             continue
    90 
    91         # If this is a relative "/" link, use the current page as the basis for the URL. Note
    92         # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
    93         # we're out of luck.
    94         if link_text.startswith('/'):
     104     
     105         # If there is a '{' in the link, then probably it's a link built on transcluded text
     106         # like "Quotes/Diary#{{C3}}"; expand "{{Cn}}" chapter names, otherwise issue advice
     107         if '{' in link_text:
     108            ch_link_pattern = re.compile(r"{{C[0-9]*}}")
     109            ch_link = ch_link_pattern.search(link_text)
     110            if debug: pywikibot.stdout('         Found transclusion in link: "{}".'.format(ch_link.group(0)))
     111            if ch_link:
     112               ch_link_match = ch_link.group(0)
     113               ch_num_pattern = re.compile("[0-9]+")
     114               ch_num = ch_num_pattern.search(ch_link_match)
     115               if ch_num:
     116                  ch_num_match = int(ch_num.group(0))
     117                  if ch_num_match >= 0 and ch_num_match <= 14:
     118                     ch_name = chapter_names[ch_num_match]
     119                     replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}")
     120                     link_text = replace_pattern.sub(ch_name, link_text)
     121                     if debug: pywikibot.stdout('         After performing transclusion, link is now "{}".'.format(link_text))
     122                  else:
     123                     if not name_printed and not debug:
     124                        pywikibot.stdout('From page "{}":'.format(page_name))
     125                        name_printed = 1
     126                     pywikibot.stdout('   ADVICE: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(link_text, ch_num_match))
     127                     advice_issued += 1
     128                     continue
     129               else:
     130                  if not name_printed and not debug:
     131                     pywikibot.stdout('From page "{}":'.format(page_name))
     132                     name_printed = 1
     133                  pywikibot.stdout('   ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(link_text))
     134                  advice_issued += 1
     135                  continue
     136            else:
     137               if not name_printed and not debug:
     138                  pywikibot.stdout('From page "{}":'.format(page_name))
     139                  name_printed = 1
     140               pywikibot.stdout('   ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(link_text, "{{C7}}"))
     141               advice_issued += 1
     142               continue
     143
     144         # If this is a relative "/" link, use the current page as the basis for the URL. Note
     145         # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
     146         # we're out of luck.
     147         if link_text.startswith('/'):
    95148            link_text = page_name + link_text
    96             #pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
    97        
    98         # If this is a relative "../" link, find the parent page and set ourselves to that page,
    99         # then remove the relative portion of the link. Note that this is only performed once,
    100         # so if there's multiple steps back ("../../"), we're out of luck.
    101         if link_text.startswith('../'):
     149            if debug: pywikibot.stdout('         Changed link_text to {} on account of "/".'.format(link_text))
     150     
     151         # If this is a relative "../" link, find the parent page, set ourselves to that page,
     152         # then remove the relative portion of the link. Note that this is only performed once,
     153         # so if there's multiple steps back ("../../"), we're out of luck.
     154         if link_text.startswith('../'):
    102155            last_slash = page_name.rfind('/')
    103156            page_name2 = page_name[0:last_slash]
    104             #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
     157            if debug: pywikibot.stdout('         Changed page_name to {} on account of "../".'.format(page_name2))
    105158            link_text = link_text[3:len(link_text)]
    106             #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
     159            if debug: pywikibot.stdout('         Changed link_text to {} on account of "../".'.format(link_text))
    107160            # If this is now going to be a bare section link for the parent page, don't add a
    108161            # slash, otherwise do because we are drilling down to another subpage
    109162            if link_text.startswith('#'):
    110                 link_text = page_name2 + link_text
     163               link_text = page_name2 + link_text
    111164            else:
    112                 link_text = page_name2 + '/' + link_text
    113             
    114         # If this is a bare section link, build URL based on this page
    115         if link_text.startswith('#'):
     165               link_text = page_name2 + '/' + link_text
     166         
     167         # If this is a bare section link, build URL based on this page
     168         if link_text.startswith('#'):
    116169            iw_url = onigalore_url + page_name2
    117             iw_found = iw_found + 1
    118             #pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
     170            iw_found += 1
     171            if debug: pywikibot.stdout('         Found link to this very page, {}.'.format(link_text))
    119172            found_iw_match = True
    120173            link_text = page_name2 + link_text
    121        
    122         # If there's no ":" in the link (before the section link, where a colon would just be
    123         # part of the text) then it's a Main namespace article, so construct URL
    124         if found_iw_match == False:
     174     
     175         # If there's no ":" in the link (before the section link, where a colon would just be
     176         # part of the text) then it's a Main namespace article; proceed with building URL
     177         if found_iw_match == False:
    125178            if not re.search(":.*#", link_text):
    126                 iw_url = onigalore_url + link_text
    127                 iw_found = iw_found + 1
    128                 #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
    129                 found_iw_match = True
    130            
    131         # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
    132         if found_iw_match == False:
     179               iw_url = onigalore_url + link_text
     180               iw_found += 1
     181               if debug: pywikibot.stdout('         Link is to a Main namespace page.')
     182               found_iw_match = True
     183         
     184         # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
     185         # before building URL
     186         if found_iw_match == False:
    133187            for prefix in intrawiki_prefixes:
    134                 #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
    135                 if prefix + ":" in link_text:
    136                     iw_url = onigalore_url + link_text
    137                     _, post_ns = link_text.split(':', 1)
    138                     #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
    139                     iw_found = iw_found + 1
    140                     found_iw_match = True
    141                     break
    142        
    143         # If we still haven't turned this match into a URL, something's gone wrong
    144         if (found_iw_match == False) or (iw_url == ""):
    145             pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
     188               if prefix + ":" in link_text:
     189                  iw_url = onigalore_url + link_text
     190                  if debug: pywikibot.stdout('         Identified namespace {}.'.format(prefix))
     191                  iw_found += 1
     192                  found_iw_match = True
     193                  break
     194     
     195         # If we still haven't turned this match into a URL, something's gone wrong
     196         if (found_iw_match == False) or (iw_url == ""):
     197            if not name_printed and not debug:
     198               pywikibot.stdout('From page "{}":'.format(page_name))
     199               name_printed = 1
     200            pywikibot.stdout('   ERROR: Couldn\'t figure out link {}.'.format(link_text))
    146201            continue
    147202
    148         # Test the URL
    149         iw_url = iw_url.replace(' ', '_')
    150         #pywikibot.stdout('Reading page at {}...'.format(iw_url))
    151         response = fetch(iw_url)
    152 
    153         # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
    154         # tell that a redirect occurred is by checking fetch's history
    155         if response.history != []:
    156             pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
    157             warnings_issued = warnings_issued + 1
    158         elif response.status_code != 200:
    159             pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
    160             warnings_issued = warnings_issued + 1
    161         else:
     203         # Test the URL
     204         iw_url = iw_url.replace(' ', '_')
     205         if debug: pywikibot.stdout('         Reading page at {}...'.format(iw_url))
     206         response = fetch(iw_url)
     207
     208         # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
     209         # tell that a redirect occurred is by checking fetch's history
     210         if response.history != []:
     211            if not name_printed and not debug:
     212               pywikibot.stdout('From page "{}":'.format(page_name))
     213               name_printed = 1
     214            pywikibot.stdout('   ADVICE: Got redirection code ({0}) on URL "{1}". You should check the link manually.'.format(response.history[0], iw_url))
     215            advice_issued += 1
     216         elif response.status_code != 200:
     217            if not name_printed and not debug:
     218               pywikibot.stdout('From page "{}":'.format(page_name))
     219               name_printed = 1
     220            pywikibot.stdout('   ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
     221            errors_issued += 1
     222         else:
    162223            # Isolate section link
    163224            pre_section, section_name = link_text.split('#', 1)
    164             #pywikibot.stdout('Searching for section link {} on page.'.format(section_name))
    165             
     225            if debug: pywikibot.stdout('         Searching for section link {} on page.'.format(section_name))
     226         
    166227            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
    167228            section_name = section_name.replace('/', '.2F')
    168             
     229         
    169230            # Read linked page to see if it really has this anchor link
    170231            soup = BeautifulSoup(response.text, 'html.parser')
    171232            found_section = False
    172233            for span_tag in soup.findAll('span'):
    173                 span_name = span_tag.get('id', None)
    174                 if span_name == section_name:
    175                     #pywikibot.stdout('Found section!')
    176                     found_section = True
    177                     break
     234               span_name = span_tag.get('id', None)
     235               if span_name == section_name:
     236                  if debug: pywikibot.stdout('         Found section!')
     237                  found_section = True
     238                  break
    178239            if found_section == False:
    179                 pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
    180                 errors_issued = errors_issued + 1
     240               if not name_printed and not debug:
     241                  pywikibot.stdout('From page "{}":'.format(page_name))
     242                  name_printed = 1
     243               pywikibot.stdout('   ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
     244               errors_issued += 1
    181245
    182246def main(*args):
    183     cat_name = ''
    184     global page_name
    185 
    186     local_args = pywikibot.handle_args(args)
    187     genFactory = pagegenerators.GeneratorFactory()
    188 
    189     for arg in local_args:
    190         if arg.startswith('-cat:'):
    191             cat_name = arg[5:]
    192         elif arg.startswith('-page:'):
    193             page_name = arg[6:]
    194 
    195     site = pywikibot.Site()
    196 
    197     # This line of code enumerates the methods in the 'page' class
    198     #pywikibot.stdout(format(dir(page)))
    199 
    200     if cat_name != '':
    201         cat_obj = pywikibot.Category(site, cat_name)
    202         generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
    203         for page in pagegenerators.PreloadingGenerator(generator, 100):
    204             pywikibot.stdout('Checking page {0}'.format(page.title()))
    205             page_name = page.title()
    206             scan_for_iw_links(page.text)
    207     elif page_name != '':
    208         page = pywikibot.Page(site, page_name)
    209         pywikibot.stdout('Checking page {0}'.format(page.title()))
    210         scan_for_iw_links(page.text)
    211 
    212     global pages_checked
    213     global iw_found
    214     global advice_issued
    215     global warnings_issued
    216     global errors_issued
    217 
    218     page_str = "pages"
    219     if pages_checked == 1:
    220         page_str = "page"
    221 
    222     link_str = "links"
    223     if iw_found == 1:
    224         link_str = "link"
    225 
    226     pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    227     pywikibot.stdout('While attempting to follow section links...')
    228 
    229     if advice_issued == 0:
    230         pywikibot.stdout('  No advice on potential problems was issued.')
    231     elif advice_issued == 1:
    232         pywikibot.stdout('  1 piece of advice on a potential problem was issued.')
    233     else:
    234         pywikibot.stdout('  {} pieces of advice on potential problems were issued.'.format(advice_issued))
    235 
    236     warning_str = "warnings were"
    237     if warnings_issued == 1:
    238         warning_str = "warning was"
    239     pywikibot.stdout('  {0} {1} issued.'.format(warnings_issued, warning_str))
    240 
    241     error_str = "errors were"
    242     if errors_issued == 1:
    243         error_str = "error was"
    244     pywikibot.stdout('  {0} {1} encountered.'.format(errors_issued, error_str))
     247   global debug
     248   global pages_checked
     249   global iw_found
     250   global advice_issued
     251   global errors_issued
     252   search_cat = ''
     253   search_page = ''
     254
     255   local_args = pywikibot.handle_args(args)
     256   genFactory = pagegenerators.GeneratorFactory()
     257
     258   for arg in local_args:
     259      if arg.startswith('-cat:'):
     260         search_cat = arg[5:]
     261      elif arg.startswith('-page:'):
     262         search_page = arg[6:]
     263      elif arg == '-dbg':
     264         debug = 1
     265      else:
     266         pywikibot.stdout('Unknown argument "{}".'.format(arg))
     267         return
     268
     269   site = pywikibot.Site()
     270
     271   # This line of code enumerates the methods in the 'page' class
     272   #pywikibot.stdout(format(dir(page)))
     273
     274   if search_cat != '':
     275      cat_obj = pywikibot.Category(site, search_cat)
     276      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
     277      for page in pagegenerators.PreloadingGenerator(generator, 100):
     278         if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
     279         scan_for_intrawiki_links(page.text, page.title())
     280   elif search_page != '':
     281      page = pywikibot.Page(site, search_page)
     282      if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
     283      scan_for_intrawiki_links(page.text, page.title())
     284
     285   page_str = "pages"
     286   if pages_checked == 1:
     287      page_str = "page"
     288
     289   link_str = "links"
     290   if iw_found == 1:
     291      link_str = "link"
     292
     293   pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
     294   pywikibot.stdout('While attempting to follow section links...')
     295
     296   if advice_issued == 0:
     297      pywikibot.stdout('   No advice on potential problems was issued.')
     298   elif advice_issued == 1:
     299      pywikibot.stdout('   1 piece of advice on a potential problem was issued.')
     300   else:
     301      pywikibot.stdout('   {} pieces of advice on potential problems were issued.'.format(advice_issued))
     302
     303   error_str = "errors were"
     304   if errors_issued == 1:
     305      error_str = "error was"
     306   pywikibot.stdout('   {0} {1} encountered.'.format(errors_issued, error_str))
    245307
    246308if __name__ == '__main__':
    247     main()
     309   main()
Note: See TracChangeset for help on using the changeset viewer.