Changeset 1180
- Timestamp: Apr 28, 2023, 2:54:21 AM
- File: ValBot/Python/check_interwiki_links.py (1 edited)
- Legend: lines prefixed with "+" were added in r1180, lines prefixed with "-" were removed from r1174, unprefixed lines are unchanged context, and "…" marks elided lines.
ValBot/Python/check_interwiki_links.py (r1174 → r1180)
  interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')

+ # Initialize globals
+ debug = 0
  pages_checked = 0
  iw_found = 0
…
  # Searches the given page text for interwiki links
- def scan_for_iw_links(page_text):
+ def scan_for_interwiki_links(page_text, page_name):
+     global debug
      global pages_checked
      global iw_found
      global errors_issued
      pages_checked = pages_checked + 1
      cur = 0
+     name_printed = 0

      for prefix in interwiki_prefixes:
          # Isolate strings that start with "[[prefix:" and end with "|" or "]"
          iw_link = "\[\[" + prefix + ":[^|\]]*(\||\])"
          for match in re.finditer(iw_link, page_text):
              # Extract just the page title from this regex match
              s = match.start() + 2 + len(prefix) + 1
              e = match.end() - 1

              # Sometimes we used a space char. instead of a '_', so fix that before querying
              page_title = page_text[s:e].replace(' ', '_')

              # Use only spaces for title when printing it
              page_title_human = page_title.replace('_', ' ')
-             pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title_human))
+             if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title_human))
              iw_found = iw_found + 1

              # Construct full URL for the particular wiki
              iw_url = interwiki_urls[cur] + page_title

              # Adjust URL if this is a foreign-language WP link
              if re.match("^[a-zA-Z]{2}:", page_title):
                  lang_code = page_title[0:2] + "."
                  # "wp:" is the Wikipedia: namespace, not a language
                  if lang_code != "wp." and lang_code != "WP.":
                      iw_url = iw_url.replace('en.', lang_code)
                      iw_url = iw_url.replace(page_title[0:3], '')

              # Test the URL
              response = fetch(iw_url)

              # One way we tell that a redirect occurred is by checking the history
              if response.history != []:
-                 pywikibot.stdout(' ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
+                 if not name_printed and not debug:
+                     pywikibot.stdout('From page "{}":'.format(page_name))
+                     name_printed = 1
+                 if page_title.startswith('WP:') and page_title == page_title.upper():
+                     pywikibot.stdout(' ERROR: Got redirection code ({0}) for {1} link "{2}", but this appears to be a deliberate use of a Wikipedia shortcut. You should check the link manually.'.format(response.history[0], prefix, page_title))
+                 else:
+                     pywikibot.stdout(' ERROR: Got redirection code ({0}) for {1} link "{2}". You should check the link manually.'.format(response.history[0], prefix, page_title))
                  errors_issued = errors_issued + 1
              elif response.status_code != 200:
-                 pywikibot.stdout(' ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
+                 if not name_printed and not debug:
+                     pywikibot.stdout('From page "{}":'.format(page_name))
+                     name_printed = 1
+                 pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(response.status_code, prefix, page_title))
                  errors_issued = errors_issued + 1
              # The usual way that a redirect occurs is that MediaWiki redirects us sneakily
              # using JavaScript, while returning code OK 200 as if the link was correct; we
              # must detect this from the page source
              elif 'Redirected from <a' in response.text:
+                 if not name_printed and not debug:
+                     pywikibot.stdout('From page "{}":'.format(page_name))
+                     name_printed = 1
                  # Extract link from this source which contains name of redirected-to page:
                  # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
                  canonical_name = response.text.split('<link rel="canonical" href="')[-1]
                  prefix_length = len(interwiki_urls[cur])
                  canonical_name = canonical_name[prefix_length:]
                  tag_end = canonical_name.find('"/>')
                  if tag_end == -1:
-                     pywikibot.stdout(' ERROR: This is a redirect page (but I could not isolate the correct page name).')
+                     pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(prefix, page_title))
                  else:
                      canonical_name = canonical_name[:tag_end]
                      if len(canonical_name) > 100:
                          # Certain things can cause the trim to fail; here we avoid slamming
                          # the output with massive page source from a failed trim
-                         pywikibot.stdout(' ERROR: This is a redirect to "{}" (string trimmed to 100 chars due to excessive length).'.format(canonical_name[:100]))
+                         pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string trimmed to 100 chars).'.format(prefix, page_title, canonical_name[:100]))
                      else:
                          canonical_name = canonical_name.replace('_', ' ')
-                         pywikibot.stdout(' ERROR: This is a redirect to "{}".'.format(canonical_name))
+                         pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}".'.format(prefix, page_title, canonical_name))
                  errors_issued = errors_issued + 1
              elif '#' in page_title:
                  # Isolate section link
-                 page_name, anchor_name = page_title.split('#')
+                 target_page_name, anchor_name = page_title.split('#')

                  # Convert dot-notation hex entities to proper characters
                  anchor_name = anchor_name.replace('.22', '"')
                  anchor_name = anchor_name.replace('.27', '\'')
                  anchor_name = anchor_name.replace('.28', '(')
                  anchor_name = anchor_name.replace('.29', ')')

                  # Read linked page to see if it really has this anchor link
                  soup = BeautifulSoup(response.text, 'html.parser')
                  found_section = False
                  for span_tag in soup.findAll('span'):
                      span_name = span_tag.get('id', None)
                      if span_name == anchor_name:
                          found_section = True
                          break
                  if found_section == False:
-                     pywikibot.stdout(' ERROR: Could not find section {0} on page {1}.'.format(anchor_name, page_name))
+                     if not name_printed and not debug:
+                         pywikibot.stdout('From page "{}":'.format(page_name))
+                         name_printed = 1
+                     target_page_name_human = target_page_name.replace('_', ' ')
+                     pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, prefix, target_page_name_human))
                      errors_issued = errors_issued + 1
          cur = cur + 1

  def main(*args):
-     cat_name = ''
-     page_name = ''
+     global debug
+     search_cat = ''
+     search_page = ''

      local_args = pywikibot.handle_args(args)
      genFactory = pagegenerators.GeneratorFactory()

      for arg in local_args:
          if arg.startswith('-cat:'):
-             cat_name = arg[5:]
+             search_cat = arg[5:]
          elif arg.startswith('-page:'):
-             page_name = arg[6:]
+             search_page = arg[6:]
+         elif arg == '-dbg':
+             debug = 1
+         else:
+             pywikibot.stdout('Unknown argument "{}".'.format(arg))
+             return

      site = pywikibot.Site()

      #pywikibot.stdout('The members of the requests.models.Response class are:')
      #pywikibot.stdout(format(dir(requests.models.Response)))

-     if cat_name != '':
-         cat_obj = pywikibot.Category(site, cat_name)
+     if search_cat != '':
+         cat_obj = pywikibot.Category(site, search_cat)
          generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
          for page in pagegenerators.PreloadingGenerator(generator, 100):
-             pywikibot.stdout('Checking page "{}"'.format(page.title()))
-             scan_for_iw_links(page.text)
-     elif page_name != '':
-         page = pywikibot.Page(site, page_name)
-         pywikibot.stdout('Checking page "{}"'.format(page.title()))
-         scan_for_iw_links(page.text)
+             if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
+             scan_for_interwiki_links(page.text, page.title())
+     elif search_page != '':
+         page = pywikibot.Page(site, search_page)
+         if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
+         scan_for_interwiki_links(page.text, page.title())

      global pages_checked
      global iw_found
      global errors_issued

      page_str = "pages"
      if pages_checked == 1:
          page_str = "page"

      link_str = "links"
      if iw_found == 1:
          link_str = "link"

      pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

      error_str = "errors were"
      if errors_issued == 1:
          error_str = "error was"

      pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

  if __name__ == '__main__':
      main()
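To make the title-extraction step concrete, the same regex and offset arithmetic can be run standalone. This is a minimal sketch, not part of the changeset; the one-entry prefix tuple and the sample wikitext are invented for illustration:

import re

# Hypothetical stand-ins for the script's interwiki_prefixes tuple and
# the page text it scans.
interwiki_prefixes = ('wikipedia',)
page_text = 'See [[wikipedia:Python (programming language)|Python]] for details.'

for prefix in interwiki_prefixes:
    # Same pattern as the script: "[[prefix:" up to the first "|" or "]"
    iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
    for match in re.finditer(iw_link, page_text):
        s = match.start() + 2 + len(prefix) + 1  # skip "[[" and "prefix:"
        e = match.end() - 1                      # drop the trailing "|" or "]"
        page_title = page_text[s:e].replace(' ', '_')
        print(page_title)  # prints: Python_(programming_language)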
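The sneaky-redirect branch relies on MediaWiki emitting a canonical <link> tag that names the redirect target. Here is a minimal sketch of that string handling, again outside the changeset proper; the HTML fragment is a hypothetical stand-in for response.text, and base_url plays the role of interwiki_urls[cur]:

# Hypothetical page source; the real script fetches this with fetch(iw_url).
html = ('<link rel="canonical" href="https://en.wikipedia.org/wiki/Target_page"/>'
        '<span>Redirected from <a href="/wiki/Old_name">Old name</a></span>')
base_url = 'https://en.wikipedia.org/wiki/'

if 'Redirected from <a' in html:
    # Take everything after the canonical-link marker, then strip the base URL
    canonical_name = html.split('<link rel="canonical" href="')[-1]
    canonical_name = canonical_name[len(base_url):]
    tag_end = canonical_name.find('"/>')
    if tag_end != -1:
        print(canonical_name[:tag_end].replace('_', ' '))  # prints: Target page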
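Assuming a working Pywikibot configuration, the revised script might be invoked like this (the category and page names are placeholders):

python check_interwiki_links.py -cat:SomeCategory -dbg
python check_interwiki_links.py -page:"Some page"

Without -dbg, the per-link "Validating ..." and per-page "Checking page ..." lines are suppressed, and a page's name is printed only once, and only when an error is found on it; that is what the name_printed flag added in this revision tracks.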