Changeset 1181
- Timestamp: Apr 28, 2023, 2:55:00 AM (20 months ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
ValBot/Python/find_external_images.py
r1173 r1181 2 2 # by iritscen@yahoo.com 3 3 # Looks at each link on a page (or in all the pages in a category) and prints the links to 4 # images that are externally-hosted. You must pass in one or both of the following args: 5 # -inlined: Show any plain URLs leading to images (these create embedded images, <img>) 6 # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>) 4 # images that are external to the wiki. Distinction is made between images hosted on oni2.net 5 # and on third-party domains. You must pass in one or both of the following args: 6 # -embedded: Show any plain URLs leading to images (these create embedded images, <img>) 7 # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>) 8 # 7 9 # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category". 8 10 # … … 15 17 16 18 import pywikibot 17 18 19 from pywikibot.bot import QuitKeyboardInterrupt 19 20 from pywikibot import pagegenerators 20 21 from pywikibot.comms.http import fetch 21 22 from pywikibot.specialbots import UploadRobot 22 #import bs4 # for listing members with dir() 23 24 import bs4 23 25 from bs4 import BeautifulSoup 24 26 27 # Initialize globals 28 debug = 0 25 29 pages_checked = 0 26 30 page_errors = 0 27 ext_images = 0 28 oni2_images = 0 31 image_errors = 0 32 linked_ext_images = 0 33 linked_oni2_images = 0 34 embedded_ext_images = 0 35 embedded_oni2_images = 0 29 36 file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg') 30 37 tag_names = [] 31 38 39 # Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1 40 def plural_check(noun, quantity): 41 if quantity != 1: 42 return noun + "s" 43 else: 44 return noun 45 32 46 # Scrapes the HTML at the given URL for image tags 33 def get_image_links(url): 34 global pages_checked 35 global page_errors 36 global ext_images 37 global oni2_images 38 global file_formats 39 global tag_names 40 41 response = fetch(url) 42 if 
response.status_code != 200: 43 pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url)) 44 page_errors = page_errors + 1 45 return 46 47 soup = BeautifulSoup(response.text, 'html.parser') 48 pages_checked = pages_checked + 1 49 for tag in soup.findAll(tag_names): 50 link = tag.get('href') 51 if not link: 52 link = tag.get('src') 53 54 # Filter out empty links 55 if not link: 56 if tag.get('id') == "top": 57 continue 58 59 class_names = tag.get('class') 60 if "selflink" in class_names: 61 continue 62 63 pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text)) 64 pywikibot.stdout(' Class is "{}".'.format(tag.get('class'))) 47 def get_image_links(page_url, page_name): 48 global debug 49 global pages_checked 50 global page_errors 51 global image_errors 52 global linked_ext_images 53 global linked_oni2_images 54 global embedded_ext_images 55 global embedded_oni2_images 56 global file_formats 57 global tag_names 58 name_printed = 0 59 60 response = fetch(page_url) 61 if response.status_code != 200: 62 pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url)) 63 page_errors += 1 64 return 65 66 soup = BeautifulSoup(response.text, 'html.parser') 67 pages_checked += 1 68 for tag in soup.findAll(tag_names): 69 link = tag.get('href') 70 if not link: 71 link = tag.get('src') 72 73 # Filter out empty links 74 if not link: 75 if tag.get('id') == "top": 65 76 continue 66 77 67 # A "src" or "href" starting with "/" would be a link to a local page or file; a 68 # link starting with "#" is a section link 69 if link.startswith('/') or link.startswith('#'): 78 class_names = tag.get('class') 79 if "selflink" in class_names: 70 80 continue 71 81 72 # The gnu.org link to the Free Documentation License is at the bottom of every page 73 if link == "http://www.gnu.org/copyleft/fdl.html": 74 continue 75 76 _, ext = os.path.splitext(link) 77 if ext.lower() in file_formats: 78 if "oni2.net" in link: 79 pywikibot.stdout(' Oni2.net image: 
{}'.format(link)) 80 oni2_images = oni2_images + 1 82 if not name_printed and not debug: 83 pywikibot.stdout('From page "{}":'.format(page_name)) 84 name_printed = 1 85 pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text)) 86 pywikibot.stdout(' Class is "{}".'.format(tag.get('class'))) 87 page_errors += 1 88 continue 89 90 # A "src" or "href" starting with "/" would be a link to a local page or file; a 91 # link starting with "#" is a section link 92 if link.startswith('/') or link.startswith('#'): 93 continue 94 95 # The gnu.org link to the Free Documentation License is at the bottom of every page 96 if link == "http://www.gnu.org/copyleft/fdl.html": 97 continue 98 99 # Determine if link is to an image 100 _, ext = os.path.splitext(link) 101 if ext.lower() in file_formats: 102 if not name_printed and not debug: 103 pywikibot.stdout('Found on page "{}":'.format(page_name)) 104 name_printed = 1 105 tag_text = format(tag) 106 if "oni2.net" in link: 107 if tag_text.startswith('<a'): 108 pywikibot.stdout(' Linked oni2.net image: {}'.format(link)) 109 linked_oni2_images += 1 110 elif tag_text.startswith('<img'): 111 pywikibot.stdout(' Embedded oni2.net image: {}'.format(link)) 112 embedded_oni2_images += 1 81 113 else: 82 pywikibot.stdout(' External image: {}'.format(link)) 83 ext_images = ext_images + 1 84 #else: 85 #pywikibot.stdout(' Other external link: {}'.format(link)) 114 pywikibot.stdout(' ERROR: Could not process oni2.net image link {}.'.format(link)) 115 image_errors += 1 116 return 117 else: 118 if tag_text.startswith('<a'): 119 pywikibot.stdout(' Linked external image: {}'.format(link)) 120 linked_ext_images += 1 121 elif tag_text.startswith('<img'): 122 pywikibot.stdout(' Embedded external image: {}'.format(link)) 123 embedded_ext_images += 1 124 else: 125 pywikibot.stdout(' ERROR: Could not process external image link {}.'.format(link)) 126 image_errors += 1 127 return 86 128 87 129 def main(*args): 88 global pages_checked 89 
global page_errors 90 global ext_images 91 global oni2_images 92 global tag_names 93 94 cat_name = '' 95 page_name = '' 96 97 #pywikibot.stdout('The members of the bs4.element.Tag class are:') 98 #pywikibot.stdout(format(dir(bs4.element.Tag))) 99 100 local_args = pywikibot.handle_args(args) 101 genFactory = pagegenerators.GeneratorFactory() 102 103 for arg in local_args: 104 if arg.startswith('-cat:'): 105 cat_name = arg[5:] 106 elif arg.startswith('-page:'): 107 page_name = arg[6:] 108 elif arg == '-linked': 109 tag_names += ['a'] 110 elif arg == '-inlined': 111 tag_names += ['img'] 112 else: 113 pywikibot.stdout('Unknown argument "{}".'.format(arg)) 114 return 115 116 if not tag_names: 117 pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.') 118 return 119 120 site = pywikibot.Site() 121 if cat_name != '': 122 cat_obj = pywikibot.Category(site, cat_name) 123 generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True) 124 for page in pagegenerators.PreloadingGenerator(generator, 100): 125 pywikibot.stdout('Checking page "{}"'.format(page.title())) 126 page_url = page.full_url().replace("%2F", "/") 127 get_image_links(page_url) 128 elif page_name != '': 129 page = pywikibot.Page(site, page_name) 130 pywikibot.stdout('Checking page "{}"'.format(page.title())) 131 page_url = page.full_url().replace("%2F", "/") 132 get_image_links(page_url) 133 else: 134 pywikibot.stdout('No page name or category name received.'.format(arg)) 135 return 136 137 chk_page_str = "pages" 138 if pages_checked == 1: 139 chk_page_str = "page" 140 141 err_page_str = "pages" 142 if page_errors == 1: 143 err_page_str = "page" 144 145 ext_image_str = "images" 146 if ext_images == 1: 147 ext_image_str = "image" 148 149 oni2_image_str = "images" 150 if oni2_images == 1: 151 oni2_image_str = "image" 152 153 pywikibot.stdout('-------------------------') 154 pywikibot.stdout('Checked {0} 
{1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str)) 155 pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str)) 130 global debug 131 global pages_checked 132 global page_errors 133 global image_errors 134 global linked_ext_images 135 global linked_oni2_images 136 global embedded_ext_images 137 global embedded_oni2_images 138 global tag_names 139 140 search_cat = '' 141 search_page = '' 142 143 #pywikibot.stdout('The members of the bs4.element.Tag class are:') 144 #pywikibot.stdout(format(dir(bs4.element.Tag))) 145 146 local_args = pywikibot.handle_args(args) 147 genFactory = pagegenerators.GeneratorFactory() 148 149 for arg in local_args: 150 if arg.startswith('-cat:'): 151 search_cat = arg[5:] 152 elif arg.startswith('-page:'): 153 search_page = arg[6:] 154 elif arg == '-linked': 155 tag_names += ['a'] 156 elif arg == '-embedded': 157 tag_names += ['img'] 158 elif arg == '-dbg': 159 debug = 1 160 else: 161 pywikibot.stdout('Unknown argument "{}".'.format(arg)) 162 return 163 164 if not tag_names: 165 pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.') 166 return 167 168 site = pywikibot.Site() 169 if search_cat != '': 170 cat_obj = pywikibot.Category(site, search_cat) 171 generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True) 172 for page in pagegenerators.PreloadingGenerator(generator, 100): 173 if debug: pywikibot.stdout('Checking page "{}"'.format(page.title())) 174 page_url = page.full_url().replace("%2F", "/") 175 get_image_links(page_url, page.title()) 176 elif search_page != '': 177 page = pywikibot.Page(site, search_page) 178 if debug: pywikibot.stdout('Checking page "{}"'.format(page.title())) 179 page_url = page.full_url().replace("%2F", "/") 180 get_image_links(page_url, page.title()) 181 else: 182 
pywikibot.stdout('No page name or category name received.'.format(arg)) 183 return 184 185 chk_page_str = plural_check("page", pages_checked) 186 err_page_str = plural_check("page", page_errors) 187 err_img_str = plural_check("image", image_errors) 188 linked_ext_image_str = plural_check("image", linked_ext_images) 189 linked_oni2_image_str = plural_check("image", linked_oni2_images) 190 embedded_ext_image_str = plural_check("image", embedded_ext_images) 191 embedded_oni2_image_str = plural_check("image", embedded_oni2_images) 192 193 pywikibot.stdout('-------------------------') 194 pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str)) 195 if 'a' in tag_names: 196 pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str)) 197 if 'img' in tag_names: 198 pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str)) 156 199 157 200 if __name__ == '__main__': 158 201 main()
Note: See TracChangeset for help on using the changeset viewer.