Index: /ValBot/check_intrawiki_section_links.py
===================================================================
--- /ValBot/check_intrawiki_section_links.py	(revision 1153)
+++ /ValBot/check_intrawiki_section_links.py	(revision 1153)
@@ -0,0 +1,200 @@
+import os
+
+from urllib.parse import urljoin
+
+import pywikibot
+import re
+
+from pywikibot.bot import QuitKeyboardInterrupt
+from pywikibot import pagegenerators
+from pywikibot.tools.formatter import color_format
+from pywikibot.comms.http import fetch
+from pywikibot.specialbots import UploadRobot
+from bs4 import BeautifulSoup
+
# Tuple of OniGalore's namespace prefixes; an intrawiki link beginning with
# one of these (plus ':') belongs to a non-Main namespace of the wiki
intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')

# URL for main namespace of our wiki; page names are appended directly to this
onigalore_url = 'https://wiki.oni2.net/'

# Interwiki prefixes (links to other wikis), for ruling out these links —
# they are skipped rather than verified
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

# Shared state, read and updated by scan_for_iw_links() and main()
pages_checked = 0    # number of pages scanned so far
iw_found = 0         # number of intrawiki section links examined
problems_found = 0   # number of redirects/bad responses/missing anchors seen
page_name = ''       # title of the page currently being scanned
+
+# Searches the given page text for intrawiki links with section links in them
def scan_for_iw_links(page_text):
    """Scan page_text for intrawiki links that contain a section anchor
    ("[[Page#Section]]"-style) and verify each anchor exists on the target page.

    Reads the module-global page_name to resolve bare-section ("#Foo") and
    relative ("../Foo") links, and updates the global counters pages_checked,
    iw_found and problems_found. Performs one HTTP fetch per verified link.
    Raises SystemExit if a link cannot be classified at all.
    """
    global pages_checked
    global iw_found
    global problems_found
    global page_name
    pages_checked = pages_checked + 1

    # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    # "[[any:thi|ng]]". Raw string: the original non-raw form relied on invalid
    # escape sequences, which newer Pythons warn about.
    iw_link = r"\[\[[^|\]]*(\||\])"
    for match in re.finditer(iw_link, page_text):
        found_iw_match = False
        iw_url = ""
        page_name2 = page_name

        # Cut out the matched text from the page, removing the "[[" from the
        # front and the "|" or "]" from the end
        s = match.start() + 2
        e = match.end() - 1
        link_text = page_text[s:e]

        # Sometimes we used a space char. instead of a '_', so fix that before querying
        link_text = link_text.replace(' ', '_')

        # If this link doesn't have a section link in it, then we don't care about it,
        # as MediaWiki takes care of checking basic intrawiki links
        if '#' not in link_text:
            continue

        # If there is a '{' in the link, then probably it's a link built on transcluded
        # text like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
        if '{' in link_text:
            pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
            continue

        # If this is a relative "../" link, find the parent page and set ourselves to that
        # page, then remove the relative portion of the link. Note that this is only performed
        # once, so if there's multiple steps back ("../../"), we're out of luck.
        if link_text.startswith('../'):
            last_slash = page_name.rfind('/')
            page_name2 = page_name[0:last_slash]
            link_text = link_text[3:]
            # If this is now going to be a bare section link for the parent page, don't add
            # a slash, otherwise do because we are drilling down to another subpage
            if link_text.startswith('#'):
                link_text = page_name2 + link_text
            else:
                link_text = page_name2 + '/' + link_text

        # If this is a bare section link, build URL based on this page
        if link_text.startswith('#'):
            iw_url = onigalore_url + page_name2
            iw_found = iw_found + 1
            found_iw_match = True
            link_text = page_name2 + link_text

        # If there's no ":" in the link (before the section link, where a colon would just
        # be part of the text) then it's a Main namespace article, so construct URL
        if not found_iw_match:
            if not re.search(":.*#", link_text):
                iw_url = onigalore_url + link_text
                iw_found = iw_found + 1
                found_iw_match = True

        # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore.
        # Bug fix: use startswith() instead of substring containment, so a colon that
        # merely appears later in the link text cannot cause a false namespace match.
        if not found_iw_match:
            for prefix in intrawiki_prefixes:
                if link_text.startswith(prefix + ":"):
                    iw_url = onigalore_url + link_text
                    _, post_ns = link_text.split(':', 1)
                    iw_found = iw_found + 1
                    found_iw_match = True
                    break

        # If we didn't match the prefix against any intrawiki prefixes, see if it matches
        # against an interwiki prefix; if so, this link can be ignored
        is_interwiki = False
        if not found_iw_match:
            for prefix in interwiki_prefixes:
                if link_text.startswith(prefix + ":"):
                    is_interwiki = True
                    break
        if is_interwiki:
            continue

        # If we still haven't turned this match into a URL, something's gone wrong.
        # Exit with a nonzero status (the interactive quit() helper exited with 0).
        if not found_iw_match or iw_url == "":
            pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
            raise SystemExit(1)

        # Test the URL
        iw_url = iw_url.replace(' ', '_')
        response = fetch(iw_url)

        # Redirects are followed automatically by fetch() and treated as "200"s, so the
        # way we tell that a redirect occurred is by checking the history
        if response.history != []:
            pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
            problems_found = problems_found + 1
        elif response.status_code != 200:
            # fetch() already prints a message for non-200 responses
            problems_found = problems_found + 1
        else:
            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor link
            soup = BeautifulSoup(response.text, 'html.parser')
            found_section = False
            for span_tag in soup.findAll('span'):
                span_name = span_tag.get('id', None)
                if span_name == section_name:
                    found_section = True
                    break
            if not found_section:
                pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
                problems_found = problems_found + 1
+
def main(*args):
    """Entry point: parse command-line arguments and scan the selected pages.

    Supported arguments (after pywikibot's own global args are consumed):
      -cat:<name>   check every page in the given category, recursing into
                    subcategories
      -page:<name>  check a single page by title
    Prints a summary line with the totals accumulated by scan_for_iw_links().
    """
    cat_name = ''
    global page_name

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    site = pywikibot.Site()

    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        # Preload pages in batches of 100 to reduce API round-trips
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page {0}'.format(page.title()))
            page_name = page.title()
            scan_for_iw_links(page.text)
    elif page_name != '':
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_iw_links(page.text)
    else:
        # Bug fix: previously the script exited silently when neither argument
        # was supplied; tell the user how to invoke it instead.
        pywikibot.stdout('No pages to check. Pass "-cat:<category>" or "-page:<title>" to select pages.')

    global pages_checked
    global iw_found
    global problems_found
    pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))
+
# Script entry point: run only when executed directly, not when imported
if __name__ == '__main__':
    main()
