Changeset 1194 for ValBot


Ignore:
Timestamp:
Nov 18, 2024, 5:00:08 AM (5 weeks ago)
Author:
iritscen
Message:

ValBot: check_intrawiki_section_links.py now understands text fragment directives.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • ValBot/Python/check_intrawiki_section_links.py

    r1192 r1194  
    59 59 def find_section(page_text, page_name, page_slug, print_result):
    60 60    global errors_issued
    61 
    62    # Isolate section link
     61   found_section = False
     62   
     63   # Isolate section link or text fragment link
    63 64    target_page_name, anchor_name = page_slug.split('#', 1)
    64 65    target_page_name_human = target_page_name.replace('_', ' ')
    65    if debug: pywikibot.stdout('         Searching for section link {} on page.'.format(anchor_name))
    66 
    67    # Read linked page to see if it really has this anchor link
    68    soup = BeautifulSoup(page_text, 'html.parser')
    69    found_section = False
    70    for span_tag in soup.findAll('span'):
    71       span_name = span_tag.get('id', None)
    72       if span_name == anchor_name:
    73          if debug and not print_result: pywikibot.stdout('         Found section in a span!')
     66   
     67   # First check if this is a text fragment directive, and look for it if so
     68   if anchor_name.startswith(':~:text='):
     69      if debug: pywikibot.stdout('         Found text fragment directive {} from URL {}.'.format(anchor_name, page_slug))
     70      anchor_name = anchor_name[8:]
     71      # We're only checking the first text directive, so strip add'l ones if present
     72      addl_fragment = anchor_name.find('&text=')
     73      if addl_fragment != -1:
     74         anchor_name = anchor_name[:addl_fragment]
     75      search_terms = anchor_name.split(',')
     76      # Delete prefix and suffix terms because they aren't needed
     77      if search_terms[0].endswith('-'):
     78         search_terms.pop(0)
     79      if search_terms[-1].startswith('-'):
     80         search_terms.pop()
     81      # Remake text directive with the terms separated by spaces as they should be in the page text
     82      newSep = ' '
     83      search_string = newSep.join(search_terms)
     84      if debug: pywikibot.stdout('         Converted text fragment to string "{}".'.format(search_string))
     85      if search_string in page_text:
    74 86          found_section = True
    75          break
     87         if debug and not print_result: pywikibot.stdout('         Found text fragment!')
     88   
     89   # If we're still here, it's a section link; read linked page to see if it really has this
     90   # anchor link
     91   if found_section == False:
     92      if debug: pywikibot.stdout('         Searching for section link {} on page.'.format(anchor_name))
     93      soup = BeautifulSoup(page_text, 'html.parser')
     94      # Search for a span with this ID
     95      for span_tag in soup.findAll('span'):
     96         span_name = span_tag.get('id', None)
     97         if span_name == anchor_name:
     98            if debug and not print_result: pywikibot.stdout('         Found section in a span!')
     99            found_section = True
     100            break
    76 101    if found_section == False:
    77 102       # Search for a div with this ID
     
    128 153    # such as Special:PermanentLink.
    129 154    if response.history != []:
    130      
    131 155       permalink1 = 'Special:PermanentLink/'.lower()
    132 156       permalink2 = 'Special:Permalink/'.lower()
Note: See TracChangeset for help on using the changeset viewer.