# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are externally-hosted. You must pass in one or both of the following args:
# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
11 |
|
---|
12 | import os
|
---|
13 |
|
---|
14 | from urllib.parse import urljoin
|
---|
15 |
|
---|
16 | import pywikibot
|
---|
17 |
|
---|
18 | from pywikibot.bot import QuitKeyboardInterrupt
|
---|
19 | from pywikibot import pagegenerators
|
---|
20 | from pywikibot.comms.http import fetch
|
---|
21 | from pywikibot.specialbots import UploadRobot
|
---|
22 | #import bs4 # for listing members with dir()
|
---|
23 | from bs4 import BeautifulSoup
|
---|
24 |
|
---|
# Running totals for the final report: pages fetched successfully, pages that failed to
# load, image links found on other domains, and image links found on oni2.net
pages_checked = page_errors = ext_images = oni2_images = 0

# A link whose extension (compared in lower case) is in this tuple is treated as an image
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# Names of the HTML tags to inspect; filled in from the -linked / -inlined arguments
tag_names = list()
31 |
|
---|
32 | # Scrapes the HTML at the given URL for image tags
|
---|
def get_image_links(url):
    """Fetch the page at ``url`` and print every external image link found in it.

    Only tags whose names are listed in the module-level ``tag_names`` are inspected.
    For each tag the "href" attribute is used, falling back to "src". Links starting
    with "/" or "#" and the gnu.org FDL link are skipped. A remaining link counts as an
    image when its extension (case-insensitive) is in ``file_formats``; it is tallied
    in ``oni2_images`` if it contains "oni2.net" and in ``ext_images`` otherwise.

    ``pages_checked`` is incremented for each page that loads with HTTP status 200 and
    ``page_errors`` for each page that does not. Nothing is returned; results are
    written with ``pywikibot.stdout``.
    """
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
        page_errors = page_errors + 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked = pages_checked + 1
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None here, which cannot be
            # searched with "in", so fall back to an empty list
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            # get_text must be called; formatting the bare attribute would print the
            # method object instead of the link's text
            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            # Other external links are not reported
            continue

        if "oni2.net" in link:
            pywikibot.stdout(' Oni2.net image: {}'.format(link))
            oni2_images = oni2_images + 1
        else:
            pywikibot.stdout(' External image: {}'.format(link))
            ext_images = ext_images + 1
86 |
|
---|
def _count_noun(count, singular):
    """Return ``singular`` when ``count`` is exactly 1, otherwise ``singular`` plus "s"."""
    return singular if count == 1 else singular + 's'


def _check_page(page):
    """Announce ``page`` and scan its rendered HTML with get_image_links()."""
    pywikibot.stdout('Checking page "{}"'.format(page.title()))
    # Undo the percent-encoding of slashes in the page URL before fetching it
    page_url = page.full_url().replace("%2F", "/")
    get_image_links(page_url)


def main(*args):
    """Parse the command-line arguments, scan the requested page(s) and print a summary.

    The arguments returned by ``pywikibot.handle_args`` are interpreted as follows:
      -cat:NAME   scan every page in category NAME (recursing into subcategories)
      -page:NAME  scan the single page NAME
      -linked     inspect <a> tags
      -inlined    inspect <img> tags
    Any other argument prints an error and returns. At least one of -linked/-inlined
    and one of -cat/-page is required; if both -cat and -page are given, the category
    is used. The function returns None in all cases.
    """
    cat_name = ''
    page_name = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]
        elif arg == '-linked':
            # tag_names is a module-level list; avoid duplicates from repeated flags
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-inlined':
            if 'img' not in tag_names:
                tag_names.append('img')
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
        return

    site = pywikibot.Site()
    if cat_name:
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            _check_page(page)
    elif page_name:
        _check_page(pywikibot.Page(site, page_name))
    else:
        # No loop variable is referenced here: it is unbound when no arguments were
        # passed, and the message has no placeholder anyway
        pywikibot.stdout('No page name or category name received.')
        return

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(
        pages_checked, _count_noun(pages_checked, 'page'),
        page_errors, _count_noun(page_errors, 'page')))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(
        oni2_images, _count_noun(oni2_images, 'image'),
        ext_images, _count_noun(ext_images, 'image')))
156 |
|
---|
# Start the scan only when this file is executed as a script
if __name__ == "__main__":
    main()