Changeset 1170
- Timestamp: Mar 21, 2022, 10:22:33 PM (3 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
ValBot/Python/check_interwiki_links.py
r1169 r1170 1 # Check Interwiki Links 2 # by iritscen@yahoo.com 3 # Looks at each link on a page (or in all the pages in a category) which uses a registered 4 # interwiki prefix and loads the linked page, verifying that it exists and that any section 5 # link, if present, is valid as well. The output will use the word "ERROR" when it cannot 6 # validate the interwiki link. 7 # Recommended viewing width: 8 # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---| 9 1 10 import os 2 11 … … 5 14 import pywikibot 6 15 import re 16 import requests 7 17 8 18 from pywikibot.bot import QuitKeyboardInterrupt … … 20 30 pages_checked = 0 21 31 iw_found = 0 22 problems_found = 032 errors_issued = 0 23 33 24 34 # Searches the given page text for interwiki links … … 26 36 global pages_checked 27 37 global iw_found 28 global problems_found38 global errors_issued 29 39 pages_checked = pages_checked + 1 30 40 cur = 0 … … 43 53 # Construct full URL for the particular wiki 44 54 iw_url = interwiki_urls[cur] + page_title 45 pywikibot. output('Found {0} link {1}.'.format(prefix, page_title))55 pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title)) 46 56 iw_found = iw_found + 1 47 57 … … 55 65 56 66 # Test the URL 57 #pywikibot. output('Testing URL {}...'.format(iw_url))67 #pywikibot.stdout(' Testing URL "{}"'.format(iw_url)) 58 68 response = fetch(iw_url) 59 69 … … 61 71 # way we tell that a redirect occurred is by checking the history 62 72 if response.history != []: 63 pywikibot. 
output('WARNING: Redirected from {}.'.format(response.history))64 problems_found = problems_found + 173 pywikibot.stdout(' ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url)) 74 errors_issued = errors_issued + 1 65 75 elif response.status_code != 200: 66 #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg67 problems_found = problems_found + 176 pywikibot.stdout(' ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url)) 77 errors_issued = errors_issued + 1 68 78 elif '#' in page_title: 69 79 # Isolate section link 70 pywikibot.output('Detected section link on page {0}.'.format(page_title))71 80 page_name, anchor_name = page_title.split('#') 72 81 … … 80 89 soup = BeautifulSoup(response.text, 'html.parser') 81 90 found_section = False 82 for tag in soup.findAll('a'): 83 link = tag.get('href', None) 84 if not link: 85 #pywikibot.output('It is not a link.') 86 continue 87 #pywikibot.output('Got link {0}.'.format(link)) 88 if not link.startswith('#'): 89 continue 90 91 if link == '#' + anchor_name: 92 pywikibot.output('Found section link!') 91 for span_tag in soup.findAll('span'): 92 span_name = span_tag.get('id', None) 93 if span_name == anchor_name: 94 #pywikibot.stdout('Found section!') 93 95 found_section = True 94 96 break 95 97 if found_section == False: 96 pywikibot. 
output('Could not find section {0} on page {1}.'.format(anchor_name, page_name))97 problems_found = problems_found + 198 pywikibot.stdout(' ERROR: Could not find section {0} on page {1}.'.format(anchor_name, page_name)) 99 errors_issued = errors_issued + 1 98 100 cur = cur + 1 99 101 … … 113 115 site = pywikibot.Site() 114 116 115 # This line of code enumerates the methods in the 'page' class116 #pywikibot.stdout(format(dir( page)))117 #pywikibot.stdout('The members of the requests.models.Response class are:') 118 #pywikibot.stdout(format(dir(requests.models.Response))) 117 119 118 120 if cat_name != '': … … 120 122 generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True) 121 123 for page in pagegenerators.PreloadingGenerator(generator, 100): 122 pywikibot.stdout('Checking page {0}'.format(page.title()))124 pywikibot.stdout('Checking page "{}"'.format(page.title())) 123 125 scan_for_iw_links(page.text) 124 126 elif page_name != '': 125 127 page = pywikibot.Page(site, page_name) 126 pywikibot.stdout('Checking page {0}'.format(page.title()))128 pywikibot.stdout('Checking page "{}"'.format(page.title())) 127 129 scan_for_iw_links(page.text) 128 130 129 131 global pages_checked 130 132 global iw_found 131 global problems_found 132 pywikibot.stdout('Checked {0} page(s) and found {1} interwiki link(s) with {2} problem(s).'.format(pages_checked, iw_found, problems_found)) 133 global errors_issued 134 135 page_str = "pages" 136 if pages_checked == 1: 137 page_str = "page" 138 139 link_str = "links" 140 if iw_found == 1: 141 link_str = "link" 142 143 pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str)) 144 145 error_str = "errors were" 146 if errors_issued == 1: 147 error_str = "error was" 148 149 pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str)) 133 150 134 151 if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.