Context Navigation

← Previous Change
Next Change →

Changeset 1173 for ValBot

Timestamp:

Jun 29, 2022, 12:06:29 AM (3 years ago)

Author:

iritscen

Message:

ValBot: check_intrawiki_section_links.py won't quit when a link cannot be understood; it will just move on. find_external_images.py is now polished and robust.

Location:

ValBot

Files:

4 edited

Docs/Pywikibot commands.rtf (modified) (previous)
Docs/Read-me.rtf (modified) (previous)
Python/check_intrawiki_section_links.py (modified) (2 diffs)
Python/find_external_images.py (modified) (3 diffs)

Legend:

Unmodified
Added
Removed

ValBot/Python/check_intrawiki_section_links.py

-              r1171
+              r1173
         if link_text.startswith('/'):
             link_text = page_name + link_text
             pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
+            #pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
         # If this is a relative "../" link, find the parent page and set ourselves to that page,
 …
         # If we still haven't turned this match into a URL, something's gone wrong
         if (found_iw_match == False) or (iw_url == ""):
             pywikibot.stdout('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
             quit()
+            pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
+            continue
         # Test the URL

ValBot/Python/find_external_images.py

-              r1169
+              r1173
+# Find External Images
+# by iritscen@yahoo.com
+# Looks at each link on a page (or in all the pages in a category) and prints the links to
+# images that are externally-hosted. You must pass in one or both of the following args:
+# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
+# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
+# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
+#
+# Recommended viewing width:
+# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
 import os
 …
 from pywikibot.comms.http import fetch
 from pywikibot.specialbots import UploadRobot
+#import bs4 # for listing members with dir()
 from bs4 import BeautifulSoup
-first_run = False
 pages_checked = 0
+page_errors = 0
+ext_images = 0
 oni2_images = 0
 file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
+tag_names = []
 # Scrapes the HTML at the given URL for image tags
+def get_image_links(url, shown):
+    links = []
+def get_image_links(url):
+    global pages_checked
+    global page_errors
+    global ext_images
     global oni2_images
+    global pages_checked
+    global file_formats
+    global tag_names
     response = fetch(url)
     if response.status_code != 200:
+        pywikibot.output('Skipping url: {}'.format(url))
+        return links
+        pywikibot.stdout('   ERROR: Could not load page at URL "{}"'.format(url))
+        page_errors = page_errors + 1
+        return
     soup = BeautifulSoup(response.text, 'html.parser')
     pages_checked = pages_checked + 1
+    if not shown:
+        tagname = 'a'
+    elif shown == 'just':
+        tagname = 'img'
+    else:
+        tagname = ['a', 'img']
+    #pywikibot.output('Looking at tags.')
+    for tag in soup.findAll(tagname):
+        link = tag.get('src', tag.get('href', None))
+    for tag in soup.findAll(tag_names):
+        link = tag.get('href')
         if not link:
+            #pywikibot.output('It is not a link.')
+            link = tag.get('src')
+        # Filter out empty links
+        if not link:
+            if tag.get('id') == "top":
+                continue
+            class_names = tag.get('class')
+            if "selflink" in class_names:
+                continue
+            pywikibot.stdout('   Could not process mystery link {}'.format(tag.get_text))
+            pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
             continue
+        #pywikibot.output('Got link {0}.'.format(link))
+        # A "src" or "href" starting with "/" would be a link to a local page or file; a
+        # link starting with "#" is a section link
+        if link.startswith('/') or link.startswith('#'):
+            continue
+        # The gnu.org link to the Free Documentation License is at the bottom of every page
+        if link == "http://www.gnu.org/copyleft/fdl.html":
+            continue
         _, ext = os.path.splitext(link)
         if ext.lower() in file_formats:
-            pywikibot.output('Found image link {0}.'.format(ext))
             if "oni2.net" in link:
                 pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
+                pywikibot.stdout('   Oni2.net image: {}'.format(link))
                 oni2_images = oni2_images + 1
+    return links
+            else:
+                pywikibot.stdout('   External image: {}'.format(link))
+                ext_images = ext_images + 1
+        #else:
+           #pywikibot.stdout('   Other external link: {}'.format(link))
 def main(*args):
+    cat = ''
+    url = ''
+    image_url = False
+    shown = False
+    desc = []
+    global pages_checked
+    global page_errors
+    global ext_images
+    global oni2_images
+    global tag_names
+    cat_name = ''
+    page_name = ''
+    #pywikibot.stdout('The members of the bs4.element.Tag class are:')
+    #pywikibot.stdout(format(dir(bs4.element.Tag)))
     local_args = pywikibot.handle_args(args)
 …
     for arg in local_args:
         if arg.startswith('-cat:'):
             cat = arg[5:]
         elif arg == '-shown':
             shown = True
         elif arg == '-justshown':
             shown = 'just'
         elif url == '':
             url = arg
+            cat_name = arg[5:]
+        elif arg.startswith('-page:'):
+            page_name = arg[6:]
+        elif arg == '-linked':
+            tag_names += ['a']
+        elif arg == '-inlined':
+            tag_names += ['img']
         else:
+            desc += [arg]
+    desc = ' '.join(desc)
+            pywikibot.stdout('Unknown argument "{}".'.format(arg))
+            return
+    if not tag_names:
+        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
+        return
     site = pywikibot.Site()
+    cat_obj = pywikibot.Category(site, cat)
+    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
+    for page in pagegenerators.PreloadingGenerator(generator, 100):
+        pywikibot.stdout('Checking page {0}'.format(page.title()))
+    if cat_name != '':
+        cat_obj = pywikibot.Category(site, cat_name)
+        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
+        for page in pagegenerators.PreloadingGenerator(generator, 100):
+            pywikibot.stdout('Checking page "{}"'.format(page.title()))
+            page_url = page.full_url().replace("%2F", "/")
+            get_image_links(page_url)
+    elif page_name != '':
+        page = pywikibot.Page(site, page_name)
+        pywikibot.stdout('Checking page "{}"'.format(page.title()))
         page_url = page.full_url().replace("%2F", "/")
+        get_image_links(page_url, shown)
+        get_image_links(page_url)
+    else:
+        pywikibot.stdout('No page name or category name received.'.format(arg))
+        return
+    global pages_checked
+    global oni2_images
+    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
+    chk_page_str = "pages"
+    if pages_checked == 1:
+        chk_page_str = "page"
+    err_page_str = "pages"
+    if page_errors == 1:
+        err_page_str = "page"
+    ext_image_str = "images"
+    if ext_images == 1:
+        ext_image_str = "image"
+    oni2_image_str = "images"
+    if oni2_images == 1:
+        oni2_image_str = "image"
+    pywikibot.stdout('-------------------------')
+    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
+    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
 if __name__ == '__main__':

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1173 for ValBot

Legend:

ValBot/Python/check_intrawiki_section_links.py

ValBot/Python/find_external_images.py

Download in other formats: