# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are external to the wiki. Distinction is made between images hosted on oni2.net
# and on third-party domains. You must pass in one or both of the following args:
# -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
#
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
| 13 |
|
---|
[1169] | 14 | import os
|
---|
| 15 |
|
---|
| 16 | from urllib.parse import urljoin
|
---|
| 17 |
|
---|
| 18 | import pywikibot
|
---|
| 19 | from pywikibot.bot import QuitKeyboardInterrupt
|
---|
| 20 | from pywikibot import pagegenerators
|
---|
| 21 | from pywikibot.comms.http import fetch
|
---|
| 22 | from pywikibot.specialbots import UploadRobot
|
---|
[1181] | 23 |
|
---|
| 24 | import bs4
|
---|
[1169] | 25 | from bs4 import BeautifulSoup
|
---|
| 26 |
|
---|
# Initialize globals
# Set to 1 by the "-dbg" argument; enables the per-page "Checking page" output
debug = 0

# Running totals that are updated while pages are scanned and reported at the end of the run
pages_checked = page_errors = image_errors = 0
linked_ext_images = linked_oni2_images = 0
embedded_ext_images = embedded_oni2_images = 0

# A link counts as an image when its extension (compared in lowercase) is one of these
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# HTML tag names to search for; filled in from the "-linked" ('a') and "-embedded" ('img') args
tag_names = []
[1169] | 38 |
|
---|
# Pass this function a singular noun and it will hand back "noun" untouched when "quantity" is
# exactly 1, or "noun" with an 's' appended for any other quantity (including 0)
def plural_check(noun, quantity):
    """Return *noun* as-is if *quantity* is 1, otherwise *noun* with "s" appended."""
    return noun if quantity == 1 else noun + "s"
| 45 |
|
---|
# Scrapes the HTML at the given URL for image tags
def get_image_links(page_url, page_name):
    """Fetch the page at *page_url* and report links on it that point to external images.

    Every tag whose name is listed in the global ``tag_names`` is examined. Its "href" (or,
    failing that, its "src") is skipped if it is a local link (starts with "/"), a section
    link (starts with "#") or the GNU FDL footer link. A remaining link whose extension is in
    ``file_formats`` is printed and counted as linked (<a>) or embedded (<img>), and as hosted
    on oni2.net or on another domain.

    Args:
        page_url: URL of the rendered page to download.
        page_name: Page title, used only in the printed per-page header.

    Returns:
        None. Results are printed with ``pywikibot.stdout`` and accumulated in the
        module-level counters.
    """
    # Only the counters are rebound here; debug, file_formats and tag_names are just read
    global pages_checked, page_errors, image_errors
    global linked_ext_images, linked_oni2_images
    global embedded_ext_images, embedded_oni2_images

    # The page header is printed at most once, and only if there is something to report
    name_printed = False

    response = fetch(page_url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        # Prefer "href" (<a>) and fall back to "src" (<img>)
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # "class" is absent on many tags, in which case get() returns None
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            # In debug mode main() has already printed which page is being checked
            if not name_printed and not debug:
                pywikibot.stdout('From page "{}":'.format(page_name))
                name_printed = True
            pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            page_errors += 1
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Determine if link is to an image
        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if not name_printed and not debug:
            pywikibot.stdout('Found on page "{}":'.format(page_name))
            name_printed = True

        # The tag name tells us whether this is a link (<a>) or an embedded image (<img>)
        if "oni2.net" in link:
            if tag.name == 'a':
                pywikibot.stdout(' Linked oni2.net image: {}'.format(link))
                linked_oni2_images += 1
            elif tag.name == 'img':
                pywikibot.stdout(' Embedded oni2.net image: {}'.format(link))
                embedded_oni2_images += 1
            else:
                pywikibot.stdout(' ERROR: Could not process oni2.net image link {}.'.format(link))
                image_errors += 1
                # An unrecognized tag type abandons the rest of this page
                return
        else:
            if tag.name == 'a':
                pywikibot.stdout(' Linked external image: {}'.format(link))
                linked_ext_images += 1
            elif tag.name == 'img':
                pywikibot.stdout(' Embedded external image: {}'.format(link))
                embedded_ext_images += 1
            else:
                pywikibot.stdout(' ERROR: Could not process external image link {}.'.format(link))
                image_errors += 1
                # An unrecognized tag type abandons the rest of this page
                return
[1169] | 128 |
|
---|
def main(*args):
    """Parse the command-line arguments, scan the requested page(s) and print a summary.

    Recognized script arguments (after Pywikibot has consumed its own global options):
        -cat:"Some Category"  scan every page in the category, recursing into subcategories
        -page:"Some Page"     scan a single page (ignored when -cat is also given)
        -linked               look at <a> tags
        -embedded             look at <img> tags
        -dbg                  print the title of each page as it is checked

    At least one of -linked/-embedded and one of -cat/-page is required; otherwise a message
    is printed and the function returns without scanning anything.

    Args:
        *args: Command-line style arguments, passed straight to ``pywikibot.handle_args``.
    """
    # debug is the only global rebound here; tag_names is mutated in place and the counters
    # are only read for the summary
    global debug

    search_cat = ''
    search_page = ''

    # Uncomment to list what a bs4 Tag offers when debugging the scraper:
    #pywikibot.stdout('The members of the bs4.element.Tag class are:')
    #pywikibot.stdout(format(dir(bs4.element.Tag)))

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-linked':
            # Guard against the flag being passed more than once
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-embedded':
            if 'img' not in tag_names:
                tag_names.append('img')
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout(
            'You need to pass this script either "-linked", "-embedded", or both '
            'arguments in order to specify which image links you want to find.')
        return

    # Build the sequence of pages to scan; a category takes precedence over a single page
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    elif search_page != '':
        pages = [pywikibot.Page(site, search_page)]
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    for page in pages:
        if debug:
            pywikibot.stdout('Checking page "{}"'.format(page.title()))
        # Restore literal slashes that were percent-encoded in the page URL
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, page.title())

    # Print the summary, pluralizing each noun to match its count
    pywikibot.stdout('-------------------------')
    pywikibot.stdout(
        'Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(
            pages_checked, plural_check("page", pages_checked),
            image_errors, plural_check("image", image_errors),
            page_errors, plural_check("page", page_errors)))
    if 'a' in tag_names:
        pywikibot.stdout(
            'Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(
                linked_oni2_images, plural_check("image", linked_oni2_images),
                linked_ext_images, plural_check("image", linked_ext_images)))
    if 'img' in tag_names:
        pywikibot.stdout(
            'Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(
                embedded_oni2_images, plural_check("image", embedded_oni2_images),
                embedded_ext_images, plural_check("image", embedded_ext_images)))
[1173] | 199 |
|
---|
# Entry point: run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()