# Source: ValBot/Python/find_external_images.py @ r1188
#
# Last change on this file since r1188 was r1181, checked in by iritscen, 19 months ago:
#
# ValBot: find_external_images.py: Changed argument "-inlined" to "-embedded". Now clearly
# distinguishing linked from embedded images. Placed some output under a "-dbg" argument.
#
# File size: 7.3 KB
# Listing columns in the repository browser: Rev, Line
# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are external to the wiki. Distinction is made between images hosted on oni2.net
# and on third-party domains. You must pass in one or both of the following args:
# -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
#
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import os
from urllib.parse import urljoin, urlsplit

import pywikibot
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot

import bs4
from bs4 import BeautifulSoup

# Initialize globals
# Set to 1 by the "-dbg" argument
debug = 0
# Running totals reported in the summary that main() prints
pages_checked = 0
page_errors = 0
image_errors = 0
linked_ext_images = 0
linked_oni2_images = 0
embedded_ext_images = 0
embedded_oni2_images = 0
# Extensions (lower-case, with the dot) that mark a link as an image
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
# HTML tag names to search for; filled in from "-linked" ('a') and "-embedded" ('img')
tag_names = []

# Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
def plural_check(noun, quantity):
    """Return the form of a noun that agrees with a count.

    noun: the singular form of the word.
    quantity: the count the word will be printed next to.

    Returns "noun" unchanged when "quantity" equals 1, otherwise "noun" with
    an "s" appended (so a quantity of 0 yields the plural).
    """
    return noun if quantity == 1 else noun + "s"

# Tells whether a link points at a file with one of the extensions in "file_formats"
def _has_image_extension(link):
    """Return True if "link" ends in an image extension, ignoring case.

    The raw link is checked first. If that fails, the path component of the
    URL is checked as well, because a query string or fragment after the file
    name (e.g. "pic.png?raw=1") hides the extension from os.path.splitext().
    """
    _, ext = os.path.splitext(link)
    if ext.lower() in file_formats:
        return True

    try:
        path = urlsplit(link).path
    except ValueError:
        # urlsplit() rejects some malformed URLs; treat those as non-images
        return False
    _, ext = os.path.splitext(path)
    return ext.lower() in file_formats


# Scrapes the HTML at the given URL for image tags
def get_image_links(page_url, page_name):
    """Fetch a rendered wiki page and report the external image links in it.

    page_url: URL of the page to download.
    page_name: page title, used only in the printed output.

    Every tag whose name is listed in the global "tag_names" is examined. Links
    that start with "/" or "#" and the gnu.org license link are skipped. Each
    remaining link with an image extension is printed and counted as linked
    (<a>) or embedded (<img>), and as oni2.net or external depending on
    whether "oni2.net" occurs in the link. Results are reported through
    pywikibot.stdout() and the global counters; nothing is returned.
    """
    global pages_checked
    global page_errors
    global image_errors
    global linked_ext_images
    global linked_oni2_images
    global embedded_ext_images
    global embedded_oni2_images
    name_printed = 0

    response = fetch(page_url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None, which cannot be searched
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            if not name_printed and not debug:
                pywikibot.stdout('From page "{}":'.format(page_name))
                name_printed = 1
            pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            page_errors += 1
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Determine if link is to an image
        if not _has_image_extension(link):
            continue

        # In debug mode the caller has already printed the page name
        if not name_printed and not debug:
            pywikibot.stdout('Found on page "{}":'.format(page_name))
            name_printed = 1

        on_oni2 = "oni2.net" in link
        origin = "oni2.net" if on_oni2 else "external"
        if tag.name == 'a':
            pywikibot.stdout(' Linked {} image: {}'.format(origin, link))
            if on_oni2:
                linked_oni2_images += 1
            else:
                linked_ext_images += 1
        elif tag.name == 'img':
            pywikibot.stdout(' Embedded {} image: {}'.format(origin, link))
            if on_oni2:
                embedded_oni2_images += 1
            else:
                embedded_ext_images += 1
        else:
            pywikibot.stdout(' ERROR: Could not process {} image link {}.'.format(origin, link))
            image_errors += 1
            # NOTE(review): this abandons the rest of the page, as the original did; only
            # reachable if "tag_names" ever holds something other than 'a' or 'img'
            return

def main(*args):
    """Parse the command line, scan the requested page(s) and print a summary.

    Recognized script arguments (anything else aborts with a message):
      -cat:"Some Category"  check every page in the category, recursively
      -page:"Some Page"     check a single page
      -linked               look at <a> tags
      -embedded             look at <img> tags
      -dbg                  print each page title as it is checked

    At least one of -linked/-embedded and one of -cat/-page is required. If
    both -cat and -page are given, only the category is searched.
    """
    global debug

    search_cat = ''
    search_page = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-linked':
            # Guard against the same tag being listed twice if the argument is repeated
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-embedded':
            if 'img' not in tag_names:
                tag_names.append('img')
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout(
            'You need to pass this script either "-linked", "-embedded", or both '
            'arguments in order to specify which image links you want to find.')
        return

    # Build the list of pages to check; a category takes priority over a single page
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    elif search_page != '':
        pages = [pywikibot.Page(site, search_page)]
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    for page in pages:
        if debug:
            pywikibot.stdout('Checking page "{}"'.format(page.title()))
        # Restore literal slashes that were percent-encoded in the page URL
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, page.title())

    chk_page_str = plural_check("page", pages_checked)
    err_page_str = plural_check("page", page_errors)
    err_img_str = plural_check("image", image_errors)
    linked_ext_image_str = plural_check("image", linked_ext_images)
    linked_oni2_image_str = plural_check("image", linked_oni2_images)
    embedded_ext_image_str = plural_check("image", embedded_ext_images)
    embedded_oni2_image_str = plural_check("image", embedded_oni2_images)

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
    # Only report the kinds of image link that were actually searched for
    if 'a' in tag_names:
        pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
    if 'img' in tag_names:
        pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))

# Run only when executed as a script, not when imported
if __name__ == '__main__':
    main()
# Note: See TracBrowser for help on using the repository browser.