source: ValBot/Python/find_external_images.py@ 1193

Last change on this file since 1193 was 1181, checked in by iritscen, 19 months ago

ValBot: find_external_images.py: Changed argument "-inlined" to "-embedded". Now clearly distinguishing linked from embedded images. Placed some output under a "-dbg" argument.

File size: 7.3 KB
Line 
1# Find External Images
2# by iritscen@yahoo.com
3# Looks at each link on a page (or in all the pages in a category) and prints the links to
4# images that are external to the wiki. Distinction is made between images hosted on oni2.net
5# and on third-party domains. You must pass in one or both of the following args:
6# -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
7# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
8#
9# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
10#
11# Recommended viewing width:
12# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
13
14import os
15
16from urllib.parse import urljoin
17
18import pywikibot
19from pywikibot.bot import QuitKeyboardInterrupt
20from pywikibot import pagegenerators
21from pywikibot.comms.http import fetch
22from pywikibot.specialbots import UploadRobot
23
24import bs4
25from bs4 import BeautifulSoup
26
# Module-wide state shared by get_image_links() and main()
debug = 0  # set to 1 by the "-dbg" argument to enable per-page progress output
# Running tallies that main() reports in its closing summary
pages_checked = page_errors = image_errors = 0
linked_ext_images = linked_oni2_images = 0
embedded_ext_images = embedded_oni2_images = 0
# Lower-case file suffixes that mark a link as pointing to an image
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
# HTML tag names to scan for; main() adds 'a' and/or 'img' based on the arguments
tag_names = []
38
# Returns "noun" untouched when "quantity" is exactly 1; for any other quantity (including
# 0) returns "noun" with an 's' appended
def plural_check(noun, quantity):
    return noun if quantity == 1 else noun + "s"
45
# Scrapes the HTML at the given URL for links to external images.
#
# page_url:  address of the rendered wiki page to download
# page_name: page title, used only in the heading printed above that page's findings
#
# Every tag whose name is listed in the global "tag_names" is examined. Its "href" (or,
# failing that, "src") is classified as an image link when its extension is one of
# "file_formats", then reported as linked (<a>) or embedded (<img>) and as oni2.net or
# external. Results are printed with pywikibot.stdout() and tallied in the module-level
# counters; nothing is returned.
def get_image_links(page_url, page_name):
    # Only the counters that are assigned to need a "global" declaration; "debug",
    # "file_formats" and "tag_names" are merely read
    global pages_checked
    global page_errors
    global image_errors
    global linked_ext_images
    global linked_oni2_images
    global embedded_ext_images
    global embedded_oni2_images
    name_printed = 0

    response = fetch(page_url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    # find_all() is the current name for the deprecated findAll() alias
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # get() returns None when the tag has no "class" attribute at all, so fall
            # back to an empty list before testing membership
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            if not name_printed and not debug:
                pywikibot.stdout('From page "{}":'.format(page_name))
                name_printed = 1
            # get_text must be called; formatting the bare attribute would print the
            # bound-method repr instead of the link's text
            pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            page_errors += 1
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Determine if link is to an image
        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if not name_printed and not debug:
            pywikibot.stdout('Found on page "{}":'.format(page_name))
            name_printed = 1

        # The tag's serialized form starts with "<a" or "<img", which tells a link from
        # an embedded image
        tag_text = format(tag)
        if "oni2.net" in link:
            if tag_text.startswith('<a'):
                pywikibot.stdout(' Linked oni2.net image: {}'.format(link))
                linked_oni2_images += 1
            elif tag_text.startswith('<img'):
                pywikibot.stdout(' Embedded oni2.net image: {}'.format(link))
                embedded_oni2_images += 1
            else:
                pywikibot.stdout(' ERROR: Could not process oni2.net image link {}.'.format(link))
                image_errors += 1
                # NOTE(review): this abandons the rest of the page; kept as in the
                # original -- confirm whether "continue" was intended
                return
        else:
            if tag_text.startswith('<a'):
                pywikibot.stdout(' Linked external image: {}'.format(link))
                linked_ext_images += 1
            elif tag_text.startswith('<img'):
                pywikibot.stdout(' Embedded external image: {}'.format(link))
                embedded_ext_images += 1
            else:
                pywikibot.stdout(' ERROR: Could not process external image link {}.'.format(link))
                image_errors += 1
                # NOTE(review): same early exit as the oni2.net branch above
                return
128
# Script entry point. Parses the command line, runs get_image_links() on the requested page
# or on every page of the requested category (recursing into subcategories), then prints a
# summary of the module-level counters.
#
# Recognized arguments (after pywikibot.handle_args() has consumed its own):
#   -cat:"Name"   search every page in the category
#   -page:"Name"  search a single page (ignored if -cat is also given)
#   -linked       look at <a> tags
#   -embedded     look at <img> tags
#   -dbg          print each page title as it is checked
# Any other argument, a missing -linked/-embedded, or a missing -cat/-page prints a message
# and returns without scanning anything.
def main(*args):
    # The counters are only read here, and "tag_names" is mutated in place, so "debug" is
    # the one global that gets rebound
    global debug

    search_cat = ''
    search_page = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-linked':
            # Guard against a repeated flag adding the same tag name twice
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-embedded':
            if 'img' not in tag_names:
                tag_names.append('img')
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.')
        return

    # Build the sequence of pages to check; a category takes precedence over a single page
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    elif search_page != '':
        pages = [pywikibot.Page(site, search_page)]
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    for page in pages:
        if debug:
            pywikibot.stdout('Checking page "{}"'.format(page.title()))
        # Undo the percent-encoding of "/" in the URL returned by full_url()
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, page.title())

    chk_page_str = plural_check("page", pages_checked)
    err_page_str = plural_check("page", page_errors)
    err_img_str = plural_check("image", image_errors)
    linked_ext_image_str = plural_check("image", linked_ext_images)
    linked_oni2_image_str = plural_check("image", linked_oni2_images)
    embedded_ext_image_str = plural_check("image", embedded_ext_images)
    embedded_oni2_image_str = plural_check("image", embedded_oni2_images)

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
    # Only report the kinds of image links that were actually searched for
    if 'a' in tag_names:
        pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
    if 'img' in tag_names:
        pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))
199
# Run main() only when this file is executed as a script, not when it is imported
if "__main__" == __name__:
    main()
Note: See TracBrowser for help on using the repository browser.