Changeset 1173 for ValBot/Python


Ignore:
Timestamp:
Jun 29, 2022, 12:06:29 AM (2 years ago)
Author:
iritscen
Message:

ValBot: check_intrawiki_section_links.py no longer quits when a link cannot be understood; it now prints the error and moves on to the next link. find_external_images.py is now polished and robust.

Location:
ValBot/Python
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • ValBot/Python/check_intrawiki_section_links.py

    r1171 r1173  
    8383        if link_text.startswith('/'):
    8484            link_text = page_name + link_text
    85             pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
     85            #pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
    8686       
    8787        # If this is a relative "../" link, find the parent page and set ourselves to that page,
     
    144144        # If we still haven't turned this match into a URL, something's gone wrong
    145145        if (found_iw_match == False) or (iw_url == ""):
    146             pywikibot.stdout('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
    147             quit()
     146            pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
     147            continue
    148148
    149149        # Test the URL
  • ValBot/Python/find_external_images.py

    r1169 r1173  
     1# Find External Images
     2# by iritscen@yahoo.com
     3# Looks at each link on a page (or in all the pages in a category) and prints the links to
     4# images that are externally-hosted. You must pass in one or both of the following args:
     5# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
     6# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
     7# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
     8#
     9# Recommended viewing width:
     10# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
     11
    112import os
    213
     
    920from pywikibot.comms.http import fetch
    1021from pywikibot.specialbots import UploadRobot
     22#import bs4 # for listing members with dir()
    1123from bs4 import BeautifulSoup
    1224
    13 first_run = False
    1425pages_checked = 0
     26page_errors = 0
     27ext_images = 0
    1528oni2_images = 0
    1629file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
     30tag_names = []
    1731
    1832# Scrapes the HTML at the given URL for image tags
    19 def get_image_links(url, shown):
    20     links = []
     33def get_image_links(url):
     34    global pages_checked
     35    global page_errors
     36    global ext_images
    2137    global oni2_images
    22     global pages_checked
     38    global file_formats
     39    global tag_names
    2340
    2441    response = fetch(url)
    2542    if response.status_code != 200:
    26         pywikibot.output('Skipping url: {}'.format(url))
    27         return links
     43        pywikibot.stdout('   ERROR: Could not load page at URL "{}"'.format(url))
     44        page_errors = page_errors + 1
     45        return
    2846
    2947    soup = BeautifulSoup(response.text, 'html.parser')
    3048    pages_checked = pages_checked + 1
    31     if not shown:
    32         tagname = 'a'
    33     elif shown == 'just':
    34         tagname = 'img'
    35     else:
    36         tagname = ['a', 'img']
    37     #pywikibot.output('Looking at tags.')
    38     for tag in soup.findAll(tagname):
    39         link = tag.get('src', tag.get('href', None))
     49    for tag in soup.findAll(tag_names):
     50        link = tag.get('href')
    4051        if not link:
    41             #pywikibot.output('It is not a link.')
     52            link = tag.get('src')
     53
     54        # Filter out empty links
     55        if not link:
     56            if tag.get('id') == "top":
     57                continue
     58
     59            class_names = tag.get('class')
     60            if "selflink" in class_names:
     61                continue
     62
     63            pywikibot.stdout('   Could not process mystery link {}'.format(tag.get_text))
     64            pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
    4265            continue
    43         #pywikibot.output('Got link {0}.'.format(link))
     66
     67        # A "src" or "href" starting with "/" would be a link to a local page or file; a
     68        # link starting with "#" is a section link
     69        if link.startswith('/') or link.startswith('#'):
     70            continue
     71
     72        # The gnu.org link to the Free Documentation License is at the bottom of every page
     73        if link == "http://www.gnu.org/copyleft/fdl.html":
     74            continue
     75
    4476        _, ext = os.path.splitext(link)
    4577        if ext.lower() in file_formats:
    46             pywikibot.output('Found image link {0}.'.format(ext))
    4778            if "oni2.net" in link:
    48                 pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
     79                pywikibot.stdout('   Oni2.net image: {}'.format(link))
    4980                oni2_images = oni2_images + 1
    50     return links
    51 
     81            else:
     82                pywikibot.stdout('   External image: {}'.format(link))
     83                ext_images = ext_images + 1
     84        #else:
     85           #pywikibot.stdout('   Other external link: {}'.format(link))
    5286
    5387def main(*args):
    54     cat = ''
    55     url = ''
    56     image_url = False
    57     shown = False
    58     desc = []
     88    global pages_checked
     89    global page_errors
     90    global ext_images
     91    global oni2_images
     92    global tag_names
     93
     94    cat_name = ''
     95    page_name = ''
     96
     97    #pywikibot.stdout('The members of the bs4.element.Tag class are:')
     98    #pywikibot.stdout(format(dir(bs4.element.Tag)))
    5999
    60100    local_args = pywikibot.handle_args(args)
     
    63103    for arg in local_args:
    64104        if arg.startswith('-cat:'):
    65             cat = arg[5:]
    66         elif arg == '-shown':
    67             shown = True
    68         elif arg == '-justshown':
    69             shown = 'just'
    70         elif url == '':
    71             url = arg
     105            cat_name = arg[5:]
     106        elif arg.startswith('-page:'):
     107            page_name = arg[6:]
     108        elif arg == '-linked':
     109            tag_names += ['a']
     110        elif arg == '-inlined':
     111            tag_names += ['img']
    72112        else:
    73             desc += [arg]
    74     desc = ' '.join(desc)
     113            pywikibot.stdout('Unknown argument "{}".'.format(arg))
     114            return
     115
     116    if not tag_names:
     117        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
     118        return
    75119
    76120    site = pywikibot.Site()
    77     cat_obj = pywikibot.Category(site, cat)
    78     generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
    79     for page in pagegenerators.PreloadingGenerator(generator, 100):
    80         pywikibot.stdout('Checking page {0}'.format(page.title()))
     121    if cat_name != '':
     122        cat_obj = pywikibot.Category(site, cat_name)
     123        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
     124        for page in pagegenerators.PreloadingGenerator(generator, 100):
     125            pywikibot.stdout('Checking page "{}"'.format(page.title()))
     126            page_url = page.full_url().replace("%2F", "/")
     127            get_image_links(page_url)
     128    elif page_name != '':
     129        page = pywikibot.Page(site, page_name)
     130        pywikibot.stdout('Checking page "{}"'.format(page.title()))
    81131        page_url = page.full_url().replace("%2F", "/")
    82         get_image_links(page_url, shown)
     132        get_image_links(page_url)
     133    else:
     134        pywikibot.stdout('No page name or category name received.'.format(arg))
     135        return
    83136
    84     global pages_checked
    85     global oni2_images
    86     pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
     137    chk_page_str = "pages"
     138    if pages_checked == 1:
     139        chk_page_str = "page"
     140
     141    err_page_str = "pages"
     142    if page_errors == 1:
     143        err_page_str = "page"
     144
     145    ext_image_str = "images"
     146    if ext_images == 1:
     147        ext_image_str = "image"
     148
     149    oni2_image_str = "images"
     150    if oni2_images == 1:
     151        oni2_image_str = "image"
     152
     153    pywikibot.stdout('-------------------------')
     154    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
     155    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
    87156
    88157if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.