1 | # Find External Images
|
---|
2 | # by iritscen@yahoo.com
|
---|
3 | # Looks at each link on a page (or in all the pages in a category) and prints the links to
|
---|
4 | # images that are external to the wiki. Distinction is made between images hosted on oni2.net
|
---|
5 | # and on third-party domains. You must pass in one or both of the following args:
|
---|
6 | # -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
|
---|
7 | # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
|
---|
8 | #
|
---|
9 | # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
|
---|
10 | #
|
---|
11 | # Recommended viewing width:
|
---|
12 | # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
|
---|
13 |
|
---|
14 | import os
|
---|
15 |
|
---|
16 | from urllib.parse import urljoin
|
---|
17 |
|
---|
18 | import pywikibot
|
---|
19 | from pywikibot.bot import QuitKeyboardInterrupt
|
---|
20 | from pywikibot import pagegenerators
|
---|
21 | from pywikibot.comms.http import fetch
|
---|
22 | from pywikibot.specialbots import UploadRobot
|
---|
23 |
|
---|
24 | import bs4
|
---|
25 | from bs4 import BeautifulSoup
|
---|
26 |
|
---|
# Module-level state shared by get_image_links() and main()
debug = 0

# Running totals; get_image_links() increments them and main() reports them
pages_checked = page_errors = image_errors = 0
linked_ext_images = linked_oni2_images = 0
embedded_ext_images = embedded_oni2_images = 0

# Extensions (compared in lower case) that mark a link as pointing to an image
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# HTML tag names to scan for; main() fills this in from -linked / -embedded
tag_names = list()
|
---|
38 |
|
---|
39 | # Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
|
---|
def plural_check(noun, quantity):
    """Return "noun" as-is when "quantity" is 1, otherwise with an "s" appended."""
    return noun if quantity == 1 else noun + "s"
|
---|
45 |
|
---|
46 | # Scrapes the HTML at the given URL for image tags
|
---|
def get_image_links(page_url, page_name):
    """Scan the HTML at "page_url" and print every non-local image link found.

    The page is fetched and parsed, then each tag whose name is listed in the
    global "tag_names" is inspected. Its "href" (or, failing that, "src") is
    treated as the link. Links starting with "/" or "#", and the gnu.org FDL
    link, are skipped. A link counts as an image when its extension is in
    "file_formats". Each image is printed and tallied in the matching global
    counter, depending on whether the link contains "oni2.net" and whether the
    tag is an <a> (linked) or an <img> (embedded).

    Args:
        page_url: URL of the rendered wiki page to fetch.
        page_name: Page title, used only in the printed output.

    Returns:
        None. Results are reported through pywikibot.stdout() and the global
        counters.
    """
    global pages_checked, page_errors, image_errors
    global linked_ext_images, linked_oni2_images
    global embedded_ext_images, embedded_oni2_images

    # The page title is printed once, before the first finding for this page
    name_printed = False

    response = fetch(page_url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None here, which cannot
            # be searched with "in"; treat it as having no classes
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            if not name_printed and not debug:
                pywikibot.stdout('From page "{}":'.format(page_name))
                name_printed = True
            pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            page_errors += 1
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Determine if link is to an image
        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if not name_printed and not debug:
            pywikibot.stdout('Found on page "{}":'.format(page_name))
            name_printed = True

        on_oni2 = "oni2.net" in link
        origin = "oni2.net" if on_oni2 else "external"
        if tag.name == 'a':
            pywikibot.stdout(' Linked {} image: {}'.format(origin, link))
            if on_oni2:
                linked_oni2_images += 1
            else:
                linked_ext_images += 1
        elif tag.name == 'img':
            pywikibot.stdout(' Embedded {} image: {}'.format(origin, link))
            if on_oni2:
                embedded_oni2_images += 1
            else:
                embedded_ext_images += 1
        else:
            pywikibot.stdout(' ERROR: Could not process {} image link {}.'.format(origin, link))
            image_errors += 1
            # NOTE(review): this abandons the rest of the page, as the original
            # code did; confirm that stopping (rather than skipping) is intended
            return
|
---|
128 |
|
---|
def _check_page(page):
    """Scan a single wiki page for image links, announcing it first in debug mode."""
    if debug:
        pywikibot.stdout('Checking page "{}"'.format(page.title()))
    # Turn every "%2F" in the URL back into a literal slash
    # NOTE(review): presumably needed for subpage titles -- confirm
    page_url = page.full_url().replace("%2F", "/")
    get_image_links(page_url, page.title())


def main(*args):
    """Parse the command line, scan the requested page(s) and print a summary.

    Recognized arguments (after pywikibot's own are stripped by handle_args):
        -cat:"Name"   scan every page in the category, recursing into subcategories
        -page:"Name"  scan a single page
        -linked       look at <a> tags
        -embedded     look at <img> tags
        -dbg          turn on debug output

    At least one of -linked / -embedded and one of -cat / -page is required;
    otherwise a message is printed and the function returns. If both -cat and
    -page are given, only the category is scanned. Any unknown argument also
    prints a message and returns.
    """
    global debug

    search_cat = ''
    search_page = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-linked':
            # Guard against the flag being repeated
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-embedded':
            if 'img' not in tag_names:
                tag_names.append('img')
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both '
                         'arguments in order to specify which image links you want to find.')
        return

    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            _check_page(page)
    elif search_page != '':
        _check_page(pywikibot.Page(site, search_page))
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(
        pages_checked, plural_check("page", pages_checked),
        image_errors, plural_check("image", image_errors),
        page_errors, plural_check("page", page_errors)))
    if 'a' in tag_names:
        pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(
            linked_oni2_images, plural_check("image", linked_oni2_images),
            linked_ext_images, plural_check("image", linked_ext_images)))
    if 'img' in tag_names:
        pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(
            embedded_oni2_images, plural_check("image", embedded_oni2_images),
            embedded_ext_images, plural_check("image", embedded_ext_images)))
|
---|
199 |
|
---|
# Run the search only when this file is executed as a script
if __name__ == "__main__":
    main()
|
---|