Changeset 1152 for ValBot


Ignore:
Timestamp:
Mar 31, 2021, 6:29:48 PM (4 years ago)
Author:
iritscen
Message:

ValBot: Now checks that the section link (if there is one) within an interwiki link is valid.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • ValBot/check_interwiki_links.py

    r1151 r1152  
    43 43            # Construct full URL for the particular wiki
    44 44            iw_url = interwiki_urls[cur] + page_title
    45             pywikibot.output('Found {0} link {1}'.format(prefix, page_title))
     45            pywikibot.output('Found {0} link {1}.'.format(prefix, page_title))
    46 46            iw_found = iw_found + 1
    47 47
     
    55 55
    56 56            # Test the URL
    57             #pywikibot.output('Testing URL {}'.format(iw_url))
     57            #pywikibot.output('Testing URL {}...'.format(iw_url))
    58 58            response = fetch(iw_url)
    59 59
     
    61 61            # way we tell that a redirect occurred is by checking the history
    62 62            if response.history != []:
    63                 pywikibot.output('WARNING: Initially got {}.'.format(response.history))
     63                pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
    64 64                problems_found = problems_found + 1
    65 65            elif response.status_code != 200:
    66 66                #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
    67 67                problems_found = problems_found + 1
     68            elif '#' in page_title:
     69                # Isolate section link
     70                pywikibot.output('Detected section link on page {0}.'.format(page_title))
     71                page_name, anchor_name = page_title.split('#')
     72               
     73                # Convert dot-notation hex entities to proper characters
     74                anchor_name = anchor_name.replace('.22', '"')
     75                anchor_name = anchor_name.replace('.27', '\'')
     76                anchor_name = anchor_name.replace('.28', '(')
     77                anchor_name = anchor_name.replace('.29', ')')
     78               
     79                # Read linked page to see if it really has this anchor link
     80                soup = BeautifulSoup(response.text, 'html.parser')
     81                found_section = False
     82                for tag in soup.findAll('a'):
     83                    link = tag.get('href', None)
     84                    if not link:
     85                        #pywikibot.output('It is not a link.')
     86                        continue
     87                    #pywikibot.output('Got link {0}.'.format(link))
     88                    if not link.startswith('#'):
     89                        continue
     90                       
     91                    if link == '#' + anchor_name:
     92                        pywikibot.output('Found section link!')
     93                        found_section = True
     94                        break
     95                if found_section == False:
     96                    pywikibot.output('Could not find section {0} on page {1}.'.format(anchor_name, page_name))
     97                    problems_found = problems_found + 1
    68 98        cur = cur + 1
    69 99
Note: See TracChangeset for help on using the changeset viewer.