source: ValBot/Python/find_external_images.py@ 1174

Last change on this file since 1174 was 1173, checked in by iritscen, 2 years ago

ValBot: check_intrawiki_section_links.py won't quit when a link cannot be understood; it will just move on. find_external_images.py is now polished and robust.

File size: 5.3 KB
Line 
1# Find External Images
2# by iritscen@yahoo.com
3# Looks at each link on a page (or in all the pages in a category) and prints the links to
4# images that are externally-hosted. You must pass in one or both of the following args:
5# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
6# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
7# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
8#
9# Recommended viewing width:
10# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
11
12import os
13
14from urllib.parse import urljoin
15
16import pywikibot
17
18from pywikibot.bot import QuitKeyboardInterrupt
19from pywikibot import pagegenerators
20from pywikibot.comms.http import fetch
21from pywikibot.specialbots import UploadRobot
22#import bs4 # for listing members with dir()
23from bs4 import BeautifulSoup
24
# Running totals reported by main() once the scan finishes; get_image_links() increments
# them as it goes.
pages_checked = page_errors = 0
ext_images = oni2_images = 0

# A link is reported as an image when its extension (compared in lowercase) is one of these
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# Names of the HTML tags to inspect; main() fills this in from "-linked" and "-inlined"
tag_names = []
31
32# Scrapes the HTML at the given URL for image tags
def get_image_links(url):
    """Print the image links found in the HTML of the page at `url`.

    The page is fetched and parsed, and every tag whose name is listed in the module-level
    `tag_names` is examined. The tag's "href" (or, failing that, its "src") is reported
    when it is not a local or section link and its extension is one of `file_formats`.
    Links containing "oni2.net" are counted in `oni2_images`, all others in `ext_images`.

    `pages_checked` is incremented when the page loads, and `page_errors` when the
    response status is anything other than 200. Nothing is returned.
    """
    global pages_checked, page_errors, ext_images, oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute gives None here, which cannot be searched
            # with "in", so fall back to an empty list
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if "oni2.net" in link:
            pywikibot.stdout(' Oni2.net image: {}'.format(link))
            oni2_images += 1
        else:
            pywikibot.stdout(' External image: {}'.format(link))
            ext_images += 1
86
def _check_page(page):
    """Announce `page` and scan its rendered HTML for image links."""
    pywikibot.stdout('Checking page "{}"'.format(page.title()))
    # Undo the escaping of "/" so that subpage titles appear as plain paths in the URL
    page_url = page.full_url().replace("%2F", "/")
    get_image_links(page_url)


def main(*args):
    """Parse the command-line arguments, scan the requested page(s), and print a summary.

    Recognized arguments are -cat:"Some Category", -page:"Some Page", -linked and
    -inlined. At least one of -linked/-inlined is required, as is a page or a category;
    when both a category and a page are given, only the category is scanned. Any other
    argument ends the run with a message.
    """
    global tag_names

    cat_name = ''
    page_name = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]
        elif arg == '-linked':
            # Guard against a repeated flag listing the same tag name twice
            if 'a' not in tag_names:
                tag_names += ['a']
        elif arg == '-inlined':
            if 'img' not in tag_names:
                tag_names += ['img']
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
        return

    site = pywikibot.Site()
    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            _check_page(page)
    elif page_name != '':
        _check_page(pywikibot.Page(site, page_name))
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    # Use the singular noun only when the matching count is exactly one
    chk_page_str = "page" if pages_checked == 1 else "pages"
    err_page_str = "page" if page_errors == 1 else "pages"
    ext_image_str = "image" if ext_images == 1 else "images"
    oni2_image_str = "image" if oni2_images == 1 else "images"

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
# Run the scan only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.