[1173] | 1 | # Find External Images
|
---|
| 2 | # by iritscen@yahoo.com
|
---|
| 3 | # Looks at each link on a page (or in all the pages in a category) and prints the links to
|
---|
| 4 | # images that are externally-hosted. You must pass in one or both of the following args:
|
---|
| 5 | # -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
|
---|
| 6 | # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
|
---|
| 7 | # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
|
---|
| 8 | #
|
---|
| 9 | # Recommended viewing width:
|
---|
| 10 | # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
|
---|
| 11 |
|
---|
[1169] | 12 | import os
|
---|
| 13 |
|
---|
| 14 | from urllib.parse import urljoin
|
---|
| 15 |
|
---|
| 16 | import pywikibot
|
---|
| 17 |
|
---|
| 18 | from pywikibot.bot import QuitKeyboardInterrupt
|
---|
| 19 | from pywikibot import pagegenerators
|
---|
| 20 | from pywikibot.comms.http import fetch
|
---|
| 21 | from pywikibot.specialbots import UploadRobot
|
---|
[1173] | 22 | #import bs4 # for listing members with dir()
|
---|
[1169] | 23 | from bs4 import BeautifulSoup
|
---|
| 24 |
|
---|
# Lower-case filename suffixes that mark a link as pointing to an image.
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# Names of the HTML tags to inspect; main() fills this list in from the
# -linked and -inlined arguments.
tag_names = []

# Running totals, updated by get_image_links() and reported by main():
#   pages_checked - pages that were fetched and parsed
#   page_errors   - pages whose fetch did not return HTTP status 200
#   oni2_images   - image links whose URL contains "oni2.net"
#   ext_images    - image links to any other domain
pages_checked = page_errors = oni2_images = ext_images = 0
[1169] | 31 |
|
---|
| 32 | # Scrapes the HTML at the given URL for image tags
|
---|
def get_image_links(url):
    """Fetch the page at `url` and report the external image links it contains.

    Every tag whose name appears in the module-level `tag_names` list is
    examined. Its "href" attribute (or, failing that, its "src" attribute) is
    treated as the link. Links that start with "/" or "#", and the gnu.org
    FDL link, are ignored. Any remaining link whose extension is listed in
    `file_formats` is printed and counted.

    Side effects: prints through pywikibot.stdout() and updates the
    module-level counters `pages_checked`, `page_errors`, `ext_images` and
    `oni2_images`.

    Args:
        url: Address of the rendered page to download.

    Returns:
        None.
    """
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        # Anchors carry their target in "href", images in "src".
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            # NOTE(review): presumably the page-top anchor that MediaWiki
            # emits without an href -- confirm.
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None here; fall back to
            # an empty list so the membership test cannot raise TypeError.
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            # get_text must be *called*; formatting the bare attribute would
            # print the bound-method repr instead of the link's text.
            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if "oni2.net" in link:
            pywikibot.stdout(' Oni2.net image: {}'.format(link))
            oni2_images += 1
        else:
            pywikibot.stdout(' External image: {}'.format(link))
            ext_images += 1
[1169] | 86 |
|
---|
def _plural(count, singular, plural):
    """Return `singular` when `count` is exactly 1, otherwise `plural`."""
    return singular if count == 1 else plural


def _check_page(page):
    """Announce `page` and scan its rendered HTML for image links."""
    pywikibot.stdout('Checking page "{}"'.format(page.title()))
    # Turn any "%2F" in the URL back into a literal "/" before fetching.
    # NOTE(review): presumably needed for subpage titles -- confirm.
    get_image_links(page.full_url().replace("%2F", "/"))


def main(*args):
    """Parse the command-line arguments, scan the requested pages, print totals.

    Recognized arguments (after pywikibot.handle_args() has processed `args`):
        -cat:NAME   scan every page in category NAME (recurse=True is passed
                    to the category generator)
        -page:NAME  scan the single page NAME
        -linked     inspect <a> tags
        -inlined    inspect <img> tags

    At least one of -linked / -inlined is required, and so is -cat or -page.
    If both -cat and -page are supplied, only the category is scanned. Any
    other argument prints an error and aborts the run.

    Args:
        *args: Arguments forwarded to pywikibot.handle_args().

    Returns:
        None. All results are printed through pywikibot.stdout().
    """
    cat_name = ''
    page_name = ''

    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            cat_name = arg[len('-cat:'):]
        elif arg.startswith('-page:'):
            page_name = arg[len('-page:'):]
        elif arg == '-linked':
            # Skip the append when the flag is repeated (or main() is called
            # again) so the shared list never holds duplicates.
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-inlined':
            if 'img' not in tag_names:
                tag_names.append('img')
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
        return

    site = pywikibot.Site()
    if cat_name:
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            _check_page(page)
    elif page_name:
        _check_page(pywikibot.Page(site, page_name))
    else:
        # The message has no placeholders, so it is printed as-is; the old
        # .format(arg) call could raise NameError when no argument was parsed.
        pywikibot.stdout('No page name or category name received.')
        return

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(
        pages_checked, _plural(pages_checked, "page", "pages"),
        page_errors, _plural(page_errors, "page", "pages")))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(
        oni2_images, _plural(oni2_images, "image", "images"),
        ext_images, _plural(ext_images, "image", "images")))
| 156 |
|
---|
# Start the scan only when this file is executed as a script; importing it
# as a module must not trigger any work.
if __name__ == "__main__":
    main()