Changeset 1173
- Timestamp: Jun 29, 2022, 12:06:29 AM (3 years ago)
- Location: ValBot
- Files: 4 edited
Legend:
- Unmodified
- Added
- Removed
-
ValBot/Python/check_intrawiki_section_links.py
r1171 r1173 83 83 if link_text.startswith('/'): 84 84 link_text = page_name + link_text 85 pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))85 #pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text)) 86 86 87 87 # If this is a relative "../" link, find the parent page and set ourselves to that page, … … 144 144 # If we still haven't turned this match into a URL, something's gone wrong 145 145 if (found_iw_match == False) or (iw_url == ""): 146 pywikibot.stdout('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))147 quit()146 pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text)) 147 continue 148 148 149 149 # Test the URL -
ValBot/Python/find_external_images.py
r1169 r1173 1 # Find External Images 2 # by iritscen@yahoo.com 3 # Looks at each link on a page (or in all the pages in a category) and prints the links to 4 # images that are externally-hosted. You must pass in one or both of the following args: 5 # -inlined: Show any plain URLs leading to images (these create embedded images, <img>) 6 # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>) 7 # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category". 8 # 9 # Recommended viewing width: 10 # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---| 11 1 12 import os 2 13 … … 9 20 from pywikibot.comms.http import fetch 10 21 from pywikibot.specialbots import UploadRobot 22 #import bs4 # for listing members with dir() 11 23 from bs4 import BeautifulSoup 12 24 13 first_run = False14 25 pages_checked = 0 26 page_errors = 0 27 ext_images = 0 15 28 oni2_images = 0 16 29 file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg') 30 tag_names = [] 17 31 18 32 # Scrapes the HTML at the given URL for image tags 19 def get_image_links(url, shown): 20 links = [] 33 def get_image_links(url): 34 global pages_checked 35 global page_errors 36 global ext_images 21 37 global oni2_images 22 global pages_checked 38 global file_formats 39 global tag_names 23 40 24 41 response = fetch(url) 25 42 if response.status_code != 200: 26 pywikibot.output('Skipping url: {}'.format(url)) 27 return links 43 pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url)) 44 page_errors = page_errors + 1 45 return 28 46 29 47 soup = BeautifulSoup(response.text, 'html.parser') 30 48 pages_checked = pages_checked + 1 31 if not shown: 32 tagname = 'a' 33 elif shown == 'just': 34 tagname = 'img' 35 else: 36 tagname = ['a', 'img'] 37 #pywikibot.output('Looking at tags.') 38 for tag in soup.findAll(tagname): 39 link = tag.get('src', tag.get('href', None)) 49 for tag in soup.findAll(tag_names): 50 link = 
tag.get('href') 40 51 if not link: 41 #pywikibot.output('It is not a link.') 52 link = tag.get('src') 53 54 # Filter out empty links 55 if not link: 56 if tag.get('id') == "top": 57 continue 58 59 class_names = tag.get('class') 60 if "selflink" in class_names: 61 continue 62 63 pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text)) 64 pywikibot.stdout(' Class is "{}".'.format(tag.get('class'))) 42 65 continue 43 #pywikibot.output('Got link {0}.'.format(link)) 66 67 # A "src" or "href" starting with "/" would be a link to a local page or file; a 68 # link starting with "#" is a section link 69 if link.startswith('/') or link.startswith('#'): 70 continue 71 72 # The gnu.org link to the Free Documentation License is at the bottom of every page 73 if link == "http://www.gnu.org/copyleft/fdl.html": 74 continue 75 44 76 _, ext = os.path.splitext(link) 45 77 if ext.lower() in file_formats: 46 pywikibot.output('Found image link {0}.'.format(ext))47 78 if "oni2.net" in link: 48 pywikibot.stdout(' Found an oni2.net image: {0}'.format(link))79 pywikibot.stdout(' Oni2.net image: {}'.format(link)) 49 80 oni2_images = oni2_images + 1 50 return links 51 81 else: 82 pywikibot.stdout(' External image: {}'.format(link)) 83 ext_images = ext_images + 1 84 #else: 85 #pywikibot.stdout(' Other external link: {}'.format(link)) 52 86 53 87 def main(*args): 54 cat = '' 55 url = '' 56 image_url = False 57 shown = False 58 desc = [] 88 global pages_checked 89 global page_errors 90 global ext_images 91 global oni2_images 92 global tag_names 93 94 cat_name = '' 95 page_name = '' 96 97 #pywikibot.stdout('The members of the bs4.element.Tag class are:') 98 #pywikibot.stdout(format(dir(bs4.element.Tag))) 59 99 60 100 local_args = pywikibot.handle_args(args) … … 63 103 for arg in local_args: 64 104 if arg.startswith('-cat:'): 65 cat = arg[5:]66 elif arg == '-shown':67 shown = True68 elif arg == '- justshown':69 shown = 'just'70 elif url == '':71 url = arg105 cat_name = arg[5:] 
106 elif arg.startswith('-page:'): 107 page_name = arg[6:] 108 elif arg == '-linked': 109 tag_names += ['a'] 110 elif arg == '-inlined': 111 tag_names += ['img'] 72 112 else: 73 desc += [arg] 74 desc = ' '.join(desc) 113 pywikibot.stdout('Unknown argument "{}".'.format(arg)) 114 return 115 116 if not tag_names: 117 pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.') 118 return 75 119 76 120 site = pywikibot.Site() 77 cat_obj = pywikibot.Category(site, cat) 78 generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True) 79 for page in pagegenerators.PreloadingGenerator(generator, 100): 80 pywikibot.stdout('Checking page {0}'.format(page.title())) 121 if cat_name != '': 122 cat_obj = pywikibot.Category(site, cat_name) 123 generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True) 124 for page in pagegenerators.PreloadingGenerator(generator, 100): 125 pywikibot.stdout('Checking page "{}"'.format(page.title())) 126 page_url = page.full_url().replace("%2F", "/") 127 get_image_links(page_url) 128 elif page_name != '': 129 page = pywikibot.Page(site, page_name) 130 pywikibot.stdout('Checking page "{}"'.format(page.title())) 81 131 page_url = page.full_url().replace("%2F", "/") 82 get_image_links(page_url, shown) 132 get_image_links(page_url) 133 else: 134 pywikibot.stdout('No page name or category name received.'.format(arg)) 135 return 83 136 84 global pages_checked 85 global oni2_images 86 pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images)) 137 chk_page_str = "pages" 138 if pages_checked == 1: 139 chk_page_str = "page" 140 141 err_page_str = "pages" 142 if page_errors == 1: 143 err_page_str = "page" 144 145 ext_image_str = "images" 146 if ext_images == 1: 147 ext_image_str = "image" 148 149 oni2_image_str = "images" 150 if oni2_images == 1: 151 oni2_image_str = "image" 152 153 
pywikibot.stdout('-------------------------') 154 pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str)) 155 pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str)) 87 156 88 157 if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.