Ignore:
Timestamp:
Apr 28, 2023, 2:55:00 AM (19 months ago)
Author:
iritscen
Message:

ValBot: find_external_images.py: Renamed the "-inlined" argument to "-embedded". The script now clearly distinguishes linked images from embedded images in its output and totals. The per-page "Checking page" output is now shown only when the new "-dbg" argument is passed.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • ValBot/Python/find_external_images.py

    r1173 r1181  
    22# by iritscen@yahoo.com
    33# Looks at each link on a page (or in all the pages in a category) and prints the links to
    4 # images that are externally-hosted. You must pass in one or both of the following args:
    5 # -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
    6 # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
     4# images that are external to the wiki. Distinction is made between images hosted on oni2.net
     5# and on third-party domains. You must pass in one or both of the following args:
     6#  -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
     7#  -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
     8#
    79# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
    810#
     
    1517
    1618import pywikibot
    17 
    1819from pywikibot.bot import QuitKeyboardInterrupt
    1920from pywikibot import pagegenerators
    2021from pywikibot.comms.http import fetch
    2122from pywikibot.specialbots import UploadRobot
    22 #import bs4 # for listing members with dir()
     23
     24import bs4
    2325from bs4 import BeautifulSoup
    2426
     27# Initialize globals
     28debug = 0
    2529pages_checked = 0
    2630page_errors = 0
    27 ext_images = 0
    28 oni2_images = 0
     31image_errors = 0
     32linked_ext_images = 0
     33linked_oni2_images = 0
     34embedded_ext_images = 0
     35embedded_oni2_images = 0
    2936file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
    3037tag_names = []
    3138
     39# Returns "noun" unchanged if "quantity" is 1; otherwise returns "noun" with an 's' appended
     40def plural_check(noun, quantity):
     41   if quantity != 1:
     42      return noun + "s"
     43   else:
     44      return noun
     45
    3246# Scrapes the HTML at the given URL for the tags listed in "tag_names" (<a> and/or <img>)
    33 def get_image_links(url):
    34     global pages_checked
    35     global page_errors
    36     global ext_images
    37     global oni2_images
    38     global file_formats
    39     global tag_names
    40 
    41     response = fetch(url)
    42     if response.status_code != 200:
    43         pywikibot.stdout('   ERROR: Could not load page at URL "{}"'.format(url))
    44         page_errors = page_errors + 1
    45         return
    46 
    47     soup = BeautifulSoup(response.text, 'html.parser')
    48     pages_checked = pages_checked + 1
    49     for tag in soup.findAll(tag_names):
    50         link = tag.get('href')
    51         if not link:
    52             link = tag.get('src')
    53 
    54         # Filter out empty links
    55         if not link:
    56             if tag.get('id') == "top":
    57                 continue
    58 
    59             class_names = tag.get('class')
    60             if "selflink" in class_names:
    61                 continue
    62 
    63             pywikibot.stdout('   Could not process mystery link {}'.format(tag.get_text))
    64             pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
     47def get_image_links(page_url, page_name):
     48   global debug
     49   global pages_checked
     50   global page_errors
     51   global image_errors
     52   global linked_ext_images
     53   global linked_oni2_images
     54   global embedded_ext_images
     55   global embedded_oni2_images
     56   global file_formats
     57   global tag_names
     58   name_printed = 0
     59
     60   response = fetch(page_url)
     61   if response.status_code != 200:
     62      pywikibot.stdout('   ERROR: Could not load page at URL "{}".'.format(page_url))
     63      page_errors += 1
     64      return
     65
     66   soup = BeautifulSoup(response.text, 'html.parser')
     67   pages_checked += 1
     68   for tag in soup.findAll(tag_names):
     69      link = tag.get('href')
     70      if not link:
     71         link = tag.get('src')
     72
     73      # Filter out empty links
     74      if not link:
     75         if tag.get('id') == "top":
    6576            continue
    6677
    67         # A "src" or "href" starting with "/" would be a link to a local page or file; a
    68         # link starting with "#" is a section link
    69         if link.startswith('/') or link.startswith('#'):
     78         class_names = tag.get('class')
     79         if "selflink" in class_names:
    7080            continue
    7181
    72         # The gnu.org link to the Free Documentation License is at the bottom of every page
    73         if link == "http://www.gnu.org/copyleft/fdl.html":
    74             continue
    75 
    76         _, ext = os.path.splitext(link)
    77         if ext.lower() in file_formats:
    78             if "oni2.net" in link:
    79                 pywikibot.stdout('   Oni2.net image: {}'.format(link))
    80                 oni2_images = oni2_images + 1
     82         if not name_printed and not debug:
     83            pywikibot.stdout('From page "{}":'.format(page_name))
     84            name_printed = 1
     85         pywikibot.stdout('   ERROR: Could not process mystery link {}.'.format(tag.get_text))
     86         pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
     87         page_errors += 1
     88         continue
     89
     90      # A "src" or "href" starting with "/" would be a link to a local page or file; a
     91      # link starting with "#" is a section link
     92      if link.startswith('/') or link.startswith('#'):
     93         continue
     94
     95      # The gnu.org link to the Free Documentation License is at the bottom of every page
     96      if link == "http://www.gnu.org/copyleft/fdl.html":
     97         continue
     98
     99      # Determine if link is to an image
     100      _, ext = os.path.splitext(link)
     101      if ext.lower() in file_formats:
     102         if not name_printed and not debug:
     103            pywikibot.stdout('Found on page "{}":'.format(page_name))
     104            name_printed = 1
     105         tag_text = format(tag)
     106         if "oni2.net" in link:
     107            if tag_text.startswith('<a'):
     108               pywikibot.stdout('   Linked oni2.net image: {}'.format(link))
     109               linked_oni2_images += 1
     110            elif tag_text.startswith('<img'):
     111               pywikibot.stdout('   Embedded oni2.net image: {}'.format(link))
     112               embedded_oni2_images += 1
    81113            else:
    82                 pywikibot.stdout('   External image: {}'.format(link))
    83                 ext_images = ext_images + 1
    84         #else:
    85            #pywikibot.stdout('   Other external link: {}'.format(link))
     114               pywikibot.stdout('   ERROR: Could not process oni2.net image link {}.'.format(link))
     115               image_errors += 1
     116               return
     117         else:
     118            if tag_text.startswith('<a'):
     119               pywikibot.stdout('   Linked external image: {}'.format(link))
     120               linked_ext_images += 1
     121            elif tag_text.startswith('<img'):
     122               pywikibot.stdout('   Embedded external image: {}'.format(link))
     123               embedded_ext_images += 1
     124            else:
     125               pywikibot.stdout('   ERROR: Could not process external image link {}.'.format(link))
     126               image_errors += 1
     127               return
    86128
    87129def main(*args):
    88     global pages_checked
    89     global page_errors
    90     global ext_images
    91     global oni2_images
    92     global tag_names
    93 
    94     cat_name = ''
    95     page_name = ''
    96 
    97     #pywikibot.stdout('The members of the bs4.element.Tag class are:')
    98     #pywikibot.stdout(format(dir(bs4.element.Tag)))
    99 
    100     local_args = pywikibot.handle_args(args)
    101     genFactory = pagegenerators.GeneratorFactory()
    102 
    103     for arg in local_args:
    104         if arg.startswith('-cat:'):
    105             cat_name = arg[5:]
    106         elif arg.startswith('-page:'):
    107             page_name = arg[6:]
    108         elif arg == '-linked':
    109             tag_names += ['a']
    110         elif arg == '-inlined':
    111             tag_names += ['img']
    112         else:
    113             pywikibot.stdout('Unknown argument "{}".'.format(arg))
    114             return
    115 
    116     if not tag_names:
    117         pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
    118         return
    119 
    120     site = pywikibot.Site()
    121     if cat_name != '':
    122         cat_obj = pywikibot.Category(site, cat_name)
    123         generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
    124         for page in pagegenerators.PreloadingGenerator(generator, 100):
    125             pywikibot.stdout('Checking page "{}"'.format(page.title()))
    126             page_url = page.full_url().replace("%2F", "/")
    127             get_image_links(page_url)
    128     elif page_name != '':
    129         page = pywikibot.Page(site, page_name)
    130         pywikibot.stdout('Checking page "{}"'.format(page.title()))
    131         page_url = page.full_url().replace("%2F", "/")
    132         get_image_links(page_url)
    133     else:
    134         pywikibot.stdout('No page name or category name received.'.format(arg))
    135         return
    136 
    137     chk_page_str = "pages"
    138     if pages_checked == 1:
    139         chk_page_str = "page"
    140 
    141     err_page_str = "pages"
    142     if page_errors == 1:
    143         err_page_str = "page"
    144 
    145     ext_image_str = "images"
    146     if ext_images == 1:
    147         ext_image_str = "image"
    148 
    149     oni2_image_str = "images"
    150     if oni2_images == 1:
    151         oni2_image_str = "image"
    152 
    153     pywikibot.stdout('-------------------------')
    154     pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
    155     pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
     130   global debug
     131   global pages_checked
     132   global page_errors
     133   global image_errors
     134   global linked_ext_images
     135   global linked_oni2_images
     136   global embedded_ext_images
     137   global embedded_oni2_images
     138   global tag_names
     139
     140   search_cat = ''
     141   search_page = ''
     142
     143   #pywikibot.stdout('The members of the bs4.element.Tag class are:')
     144   #pywikibot.stdout(format(dir(bs4.element.Tag)))
     145
     146   local_args = pywikibot.handle_args(args)
     147   genFactory = pagegenerators.GeneratorFactory()
     148
     149   for arg in local_args:
     150      if arg.startswith('-cat:'):
     151         search_cat = arg[5:]
     152      elif arg.startswith('-page:'):
     153         search_page = arg[6:]
     154      elif arg == '-linked':
     155         tag_names += ['a']
     156      elif arg == '-embedded':
     157         tag_names += ['img']
     158      elif arg == '-dbg':
     159         debug = 1
     160      else:
     161         pywikibot.stdout('Unknown argument "{}".'.format(arg))
     162         return
     163
     164   if not tag_names:
     165      pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.')
     166      return
     167
     168   site = pywikibot.Site()
     169   if search_cat != '':
     170      cat_obj = pywikibot.Category(site, search_cat)
     171      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
     172      for page in pagegenerators.PreloadingGenerator(generator, 100):
     173         if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
     174         page_url = page.full_url().replace("%2F", "/")
     175         get_image_links(page_url, page.title())
     176   elif search_page != '':
     177      page = pywikibot.Page(site, search_page)
     178      if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
     179      page_url = page.full_url().replace("%2F", "/")
     180      get_image_links(page_url, page.title())
     181   else:
     182      pywikibot.stdout('No page name or category name received.'.format(arg))
     183      return
     184
     185   chk_page_str = plural_check("page", pages_checked)
     186   err_page_str = plural_check("page", page_errors)
     187   err_img_str = plural_check("image", image_errors)
     188   linked_ext_image_str = plural_check("image", linked_ext_images)
     189   linked_oni2_image_str = plural_check("image", linked_oni2_images)
     190   embedded_ext_image_str = plural_check("image", embedded_ext_images)
     191   embedded_oni2_image_str = plural_check("image", embedded_oni2_images)
     192
     193   pywikibot.stdout('-------------------------')
     194   pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
     195   if 'a' in tag_names:
     196      pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
     197   if 'img' in tag_names:
     198      pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))
    156199
    157200if __name__ == '__main__':
    158     main()
     201   main()
Note: See TracChangeset for help on using the changeset viewer.