Changeset 1174 for ValBot/Python
- Timestamp: Jun 29, 2022, 12:11:41 AM (2 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
ValBot/Python/check_interwiki_links.py
r1170 r1174
 14  14   import pywikibot
 15  15   import re
 16     - import requests
     16 + import requests # for listing members with dir()
 17  17
 18  18   from pywikibot.bot import QuitKeyboardInterrupt
  …   …
 51  51   page_title = page_text[s:e].replace(' ', '_')
 52  52
     53 + # Use only spaces for title when printing it
     54 + page_title_human = page_title.replace('_', ' ')
     55 + pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title_human))
     56 + iw_found = iw_found + 1
     57 +
 53  58   # Construct full URL for the particular wiki
 54  59   iw_url = interwiki_urls[cur] + page_title
 55     - pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title))
 56     - iw_found = iw_found + 1
 57  60
 58  61   # Adjust URL if this is a foreign-language WP link
  …   …
 65  68
 66  69   # Test the URL
 67     - #pywikibot.stdout(' Testing URL "{}"'.format(iw_url))
 68  70   response = fetch(iw_url)
 69  71
 70     - # Redirects are followed automatically by fetch() and treated as "200"s, so the
 71     - # way we tell that a redirect occurred is by checking the history
     72 + # One way we tell that a redirect occurred is by checking the history
 72  73   if response.history != []:
 73  74   pywikibot.stdout(' ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
  …   …
 75  76   elif response.status_code != 200:
 76  77   pywikibot.stdout(' ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
     78 + errors_issued = errors_issued + 1
     79 + # The usual way that a redirect occurs is that MediaWiki redirects us sneakily
     80 + # using JavaScript, while returning code OK 200 as if the link was correct; we
     81 + # must detect this from the page source
     82 + elif 'Redirected from <a' in response.text:
     83 + # Extract link from this source which contains name of redirected-to page:
     84 + # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
     85 + canonical_name = response.text.split('<link rel="canonical" href="')[-1]
     86 + prefix_length = len(interwiki_urls[cur])
     87 + canonical_name = canonical_name[prefix_length:]
     88 + tag_end = canonical_name.find('"/>')
     89 + if tag_end == -1:
     90 + pywikibot.stdout(' ERROR: This is a redirect page (but I could not isolate the correct page name).')
     91 + else:
     92 + canonical_name = canonical_name[:tag_end]
     93 + if len(canonical_name) > 100:
     94 + # Certain things can cause the trim to fail; here we avoid slamming
     95 + # the output with massive page source from a failed trim
     96 + pywikibot.stdout(' ERROR: This is a redirect to "{}" (string trimmed to 100 chars due to excessive length).'.format(canonical_name[:100]))
     97 + else:
     98 + canonical_name = canonical_name.replace('_', ' ')
     99 + pywikibot.stdout(' ERROR: This is a redirect to "{}".'.format(canonical_name))
 77 100   errors_issued = errors_issued + 1
 78 101   elif '#' in page_title:
  …   …
 92 115   span_name = span_tag.get('id', None)
 93 116   if span_name == anchor_name:
 94     - #pywikibot.stdout('Found section!')
 95 117   found_section = True
 96 118   break
Note: See TracChangeset for help on using the changeset viewer.