Changeset 1185
- Timestamp:
- Aug 15, 2023, 4:03:16 AM (18 months ago)
- Location:
- ValBot
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
ValBot/Python/check_interwiki_links.py
r1180 r1185 13 13 14 14 import pywikibot 15 import bs4 15 16 import re 16 17 import requests # for listing members with dir() … … 33 34 iw_found = 0 34 35 errors_issued = 0 36 name_printed = 0 37 38 # Prints the name of a page on which something occurred, if it has not been printed before 39 def possibly_print(page_name): 40 global debug 41 global name_printed 42 43 if not name_printed and not debug: 44 pywikibot.stdout('') 45 pywikibot.stdout('From page "{}":'.format(page_name)) 46 name_printed = 1 47 48 # Search a page for the section specified in the link 49 def find_section(page_text, page_name, page_slug, prefix, print_result): 50 global errors_issued 51 52 # Isolate section link 53 target_page_name, anchor_name = page_slug.split('#') 54 target_page_name_human = target_page_name.replace('_', ' ') 55 56 # Convert dot-notation hex entities to proper characters 57 anchor_name = anchor_name.replace('.22', '"') 58 anchor_name = anchor_name.replace('.27', '\'') 59 anchor_name = anchor_name.replace('.28', '(') 60 anchor_name = anchor_name.replace('.29', ')') 61 62 # Read linked page to see if it really has this anchor link 63 soup = BeautifulSoup(page_text, 'html.parser') 64 found_section = False 65 for span_tag in soup.findAll('span'): # search for span with ID matching the section name 66 span_name = span_tag.get('id', None) 67 if span_name == anchor_name: 68 found_section = True 69 break 70 if found_section == False: 71 for span_tag in soup.findAll('div'): # search for div with ID matching the section name 72 span_name = span_tag.get('id', None) 73 if span_name == anchor_name: 74 found_section = True 75 break 76 if found_section == False: 77 possibly_print(page_name) 78 pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, prefix, target_page_name_human)) 79 errors_issued = errors_issued + 1 80 elif print_result == True: 81 pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, prefix, target_page_name_human)) 82 83 # For a link that redirected us to another page, extract the name of the target page from 84 # the target page's source 85 def find_canonical_link(page_text, page_name, page_slug, prefix, prefix_url): 86 # Extract link from this markup which contains name of redirected-to page: 87 # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/> 88 canonical_name = page_text.split('<link rel="canonical" href="')[-1] 89 prefix_length = len(prefix_url) 90 canonical_name = canonical_name[prefix_length:] 91 tag_end = canonical_name.find('">') 92 93 if tag_end == -1: 94 pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(prefix, page_slug)) 95 errors_issued = errors_issued + 1 96 else: 97 canonical_name = canonical_name[:tag_end] 98 if len(canonical_name) > 100: 99 # Certain things can cause the trim to fail; report error and avoid slamming the 100 # output with massive page source from a failed trim 101 pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(prefix, page_slug, canonical_name[:100])) 102 errors_issued = errors_issued + 1 103 else: 104 canonical_name = canonical_name.replace('_', ' ') 105 if '#' in page_slug: 106 _, anchor_name = page_slug.split('#') 107 pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking section link….'.format(prefix, page_slug, canonical_name, anchor_name)) 108 find_section(page_text, page_name, page_slug, prefix, True) 109 else: 110 pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(prefix, page_slug, canonical_name)) 111 112 # Test an interwiki link and look for a section link if applicable 113 def test_interwiki_link(prefix, prefix_url, iw_url, page_name, page_slug): 114 global errors_issued 115 116 response = fetch(iw_url) 117 118 # One way we tell that a redirect occurred is by checking fetch's history, as it 119 # automatically follows redirects. This will catch formal redirects which come from pages 120 # such as Special:PermanentLink. 121 if response.history != []: 122 possibly_print(page_name) 123 124 if page_slug.startswith('WP:') and page_slug == page_slug.upper(): 125 pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(response.history[0], prefix, page_slug)) 126 find_canonical_link(response.text, page_name, page_slug, prefix, prefix_url) 127 else: 128 permalink1 = 'Special:PermanentLink/'.lower() 129 permalink2 = 'Special:Permalink/'.lower() 130 page_slug_lower = page_slug.lower() 131 if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2): 132 pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(response.history[0], prefix, page_slug)) 133 find_canonical_link(response.text, page_name, page_slug, prefix, prefix_url) 134 else: 135 pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(response.history[0], prefix, page_slug)) 136 errors_issued = errors_issued + 1 137 elif response.status_code != 200: 138 possibly_print(page_name) 139 pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(response.status_code, prefix, page_slug)) 140 errors_issued = errors_issued + 1 141 # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily 142 # using JavaScript, while returning code OK 200 as if the link was correct; this happens 143 # when a redirect page is accessed. We must detect these soft redirects by looking at the 144 # page source to find the redirect note inserted at the top of the page for the reader. 145 elif 'Redirected from <a' in response.text: 146 possibly_print(page_name) 147 pywikibot.stdout(' Got silently redirected by {0} link "{1}". Checking the target page….'.format(prefix, page_slug)) 148 find_canonical_link(response.text, page_name, page_slug, prefix, prefix_url) 149 elif '#' in page_slug: 150 find_section(response.text, page_name, page_slug, prefix, False) 35 151 36 152 # Searches the given page text for interwiki links … … 39 155 global pages_checked 40 156 global iw_found 41 global errors_issued157 global name_printed 42 158 pages_checked = pages_checked + 1 43 cur = 0159 cur_prefix = 0 44 160 name_printed = 0 45 161 … … 52 168 e = match.end() - 1 53 169 54 # Sometimes we used a space char. instead of a '_', so fix that before querying55 page_ title= page_text[s:e].replace(' ', '_')56 57 # Use onlyspaces for title when printing it58 page_title_human = page_ title.replace('_', ' ')170 # Commonly we use spaces instead of underscores, so fix that before querying 171 page_slug = page_text[s:e].replace(' ', '_') 172 173 # But use spaces for title when printing it 174 page_title_human = page_slug.replace('_', ' ') 59 175 if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title_human)) 60 176 iw_found = iw_found + 1 61 177 62 178 # Construct full URL for the particular wiki 63 iw_url = interwiki_urls[cur ] + page_title179 iw_url = interwiki_urls[cur_prefix] + page_slug 64 180 65 181 # Adjust URL if this is a foreign-language WP link 66 if re.match("^[a-zA-Z]{2}:", page_ title):67 lang_code = page_ title[0:2] + "."182 if re.match("^[a-zA-Z]{2}:", page_slug): 183 lang_code = page_slug[0:2] + "." 68 184 # "wp:" is the Wikipedia: namespace, not a language 69 185 if lang_code != "wp." and lang_code != "WP.": 70 186 iw_url = iw_url.replace('en.', lang_code) 71 iw_url = iw_url.replace(page_ title[0:3], '')187 iw_url = iw_url.replace(page_slug[0:3], '') 72 188 73 189 # Test the URL 74 response = fetch(iw_url) 75 76 # One way we tell that a redirect occurred is by checking the history 77 if response.history != []: 78 if not name_printed and not debug: 79 pywikibot.stdout('From page "{}":'.format(page_name)) 80 name_printed = 1 81 if page_title.startswith('WP:') and page_title == page_title.upper(): 82 pywikibot.stdout(' ERROR: Got redirection code ({0}) for {1} link "{2}", but this appears to be a deliberate use of a Wikipedia shortcut. You should check the link manually.'.format(response.history[0], prefix, page_title)) 83 else: 84 pywikibot.stdout(' ERROR: Got redirection code ({0}) for {1} link "{2}". You should check the link manually.'.format(response.history[0], prefix, page_title)) 85 errors_issued = errors_issued + 1 86 elif response.status_code != 200: 87 if not name_printed and not debug: 88 pywikibot.stdout('From page "{}":'.format(page_name)) 89 name_printed = 1 90 pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(response.status_code, prefix, page_title)) 91 errors_issued = errors_issued + 1 92 # The usual way that a redirect occurs is that MediaWiki redirects us sneakily 93 # using JavaScript, while returning code OK 200 as if the link was correct; we 94 # must detect this from the page source 95 elif 'Redirected from <a' in response.text: 96 if not name_printed and not debug: 97 pywikibot.stdout('From page "{}":'.format(page_name)) 98 name_printed = 1 99 # Extract link from this source which contains name of redirected-to page: 100 # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/> 101 canonical_name = response.text.split('<link rel="canonical" href="')[-1] 102 prefix_length = len(interwiki_urls[cur]) 103 canonical_name = canonical_name[prefix_length:] 104 tag_end = canonical_name.find('"/>') 105 if tag_end == -1: 106 pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.', format(prefix, page_title)) 107 else: 108 canonical_name = canonical_name[:tag_end] 109 if len(canonical_name) > 100: 110 # Certain things can cause the trim to fail; here we avoid slamming 111 # the output with massive page source from a failed trim 112 pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string trimmed to 100 chars).'.format(prefix, page_title, canonical_name[:100])) 113 else: 114 canonical_name = canonical_name.replace('_', ' ') 115 pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}".'.format(prefix, page_title, canonical_name)) 116 errors_issued = errors_issued + 1 117 elif '#' in page_title: 118 # Isolate section link 119 target_page_name, anchor_name = page_title.split('#') 120 121 # Convert dot-notation hex entities to proper characters 122 anchor_name = anchor_name.replace('.22', '"') 123 anchor_name = anchor_name.replace('.27', '\'') 124 anchor_name = anchor_name.replace('.28', '(') 125 anchor_name = anchor_name.replace('.29', ')') 126 127 # Read linked page to see if it really has this anchor link 128 soup = BeautifulSoup(response.text, 'html.parser') 129 found_section = False 130 for span_tag in soup.findAll('span'): 131 span_name = span_tag.get('id', None) 132 if span_name == anchor_name: 133 found_section = True 134 break 135 if found_section == False: 136 if not name_printed and not debug: 137 pywikibot.stdout('From page "{}":'.format(page_name)) 138 name_printed = 1 139 target_page_name_human = target_page_name.replace('_', ' ') 140 pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, prefix, target_page_name_human)) 141 errors_issued = errors_issued + 1 142 cur = cur + 1 143 190 test_interwiki_link(prefix, interwiki_urls[cur_prefix], iw_url, page_name, page_slug) 191 cur_prefix = cur_prefix + 1 192 193 # Print a wrap-up message 194 def print_summary(): 195 global pages_checked 196 global iw_found 197 global errors_issued 198 199 page_str = "pages" 200 if pages_checked == 1: 201 page_str = "page" 202 203 link_str = "links" 204 if iw_found == 1: 205 link_str = "link" 206 207 pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str)) 208 209 error_str = "errors were" 210 if errors_issued == 1: 211 error_str = "error was" 212 213 pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str)) 214 215 # Main function 144 216 def main(*args): 145 217 global debug … … 147 219 search_page = '' 148 220 221 # Process arguments 149 222 local_args = pywikibot.handle_args(args) 150 genFactory = pagegenerators.GeneratorFactory()151 152 223 for arg in local_args: 153 224 if arg.startswith('-cat:'): … … 158 229 debug = 1 159 230 else: 160 pywikibot.stdout('Unknown argument "{}". '.format(arg))231 pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg)) 161 232 return 162 163 site = pywikibot.Site()164 233 165 234 #pywikibot.stdout('The members of the requests.models.Response class are:') 166 235 #pywikibot.stdout(format(dir(requests.models.Response))) 167 236 #return 237 238 # Check specified page or loop through specified category and check all pages 239 site = pywikibot.Site() 168 240 if search_cat != '': 169 241 cat_obj = pywikibot.Category(site, search_cat) … … 177 249 scan_for_interwiki_links(page.text, page.title()) 178 250 179 global pages_checked 180 global iw_found 181 global errors_issued 182 183 page_str = "pages" 184 if pages_checked == 1: 185 page_str = "page" 186 187 link_str = "links" 188 if iw_found == 1: 189 link_str = "link" 190 191 pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str)) 192 193 error_str = "errors were" 194 if errors_issued == 1: 195 error_str = "error was" 196 197 pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str)) 251 # Print the results 252 print_summary() 198 253 199 254 if __name__ == '__main__': -
ValBot/Python/check_intrawiki_section_links.py
r1179 r1185 27 27 onigalore_url = 'https://wiki.oni2.net/' 28 28 29 # Tuple of interwiki prefixes, for passing over such links29 # Tuple of interwiki prefixes, for recognizing and passing over such links 30 30 interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp') 31 31 … … 44 44 advice_issued = 0 45 45 errors_issued = 0 46 name_printed = 0 47 48 # Prints the name of a page on which something occurred, if it has not been printed before 49 def possibly_print(page_name): 50 global debug 51 global name_printed 52 53 if not name_printed and not debug: 54 pywikibot.stdout('') 55 pywikibot.stdout('From page "{}":'.format(page_name)) 56 name_printed = 1 57 58 # Search a page for the section specified in the link 59 def find_section(page_text, page_name, page_slug, print_result): 60 global errors_issued 61 62 # Isolate section link 63 target_page_name, anchor_name = page_slug.split('#', 1) 64 target_page_name_human = target_page_name.replace('_', ' ') 65 if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name)) 66 67 # Convert slash character to the dot-notation hex encoding that MediaWiki uses 68 anchor_name = anchor_name.replace('/', '.2F') 69 70 # Read linked page to see if it really has this anchor link 71 soup = BeautifulSoup(page_text, 'html.parser') 72 found_section = False 73 for span_tag in soup.findAll('span'): 74 span_name = span_tag.get('id', None) 75 if span_name == anchor_name: 76 if debug and not print_result: pywikibot.stdout(' Found section in a span!') 77 found_section = True 78 break 79 if found_section == False: 80 # Search for a div with this ID 81 for span_tag in soup.findAll('div'): 82 span_name = span_tag.get('id', None) 83 if span_name == anchor_name: 84 if debug and not print_result: pywikibot.stdout(' Found section in a div!') 85 found_section = True 86 break 87 if found_section == False: 88 possibly_print(page_name) 89 pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, pre_section)) 90 errors_issued += 1 91 elif debug and print_result: 92 pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human)) 93 94 # For a link that redirected us to another page, extract the name of the target page from 95 # the target page's source 96 def find_canonical_link(page_text, page_name, page_slug): 97 # Extract link from this markup which contains name of redirected-to page: 98 # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/> 99 # "wgPageName":"Namespace:Page_name", 100 canonical_name = page_text.split('"wgPageName":"')[-1] 101 tag_end = canonical_name.find('",') 102 103 if tag_end == -1: 104 pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug)) 105 errors_issued = errors_issued + 1 106 else: 107 canonical_name = canonical_name[:tag_end] 108 if len(canonical_name) > 100: 109 # Certain things can cause the trim to fail; report error and avoid slamming the 110 # output with massive page source from a failed trim 111 pywikibot.stdout(' ERROR: The link "{}" is a redirect to "{2}…" (string overflow).'.format(page_slug, canonical_name[:100])) 112 errors_issued = errors_issued + 1 113 else: 114 canonical_name = canonical_name.replace('_', ' ') 115 if '#' in page_slug: 116 _, anchor_name = page_slug.split('#') 117 if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name)) 118 find_section(page_text, page_name, page_slug, True) 119 else: 120 pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name)) 121 122 # Test an intrawiki link and look for a section link if applicable 123 def test_intrawiki_link(iw_url, page_name, page_slug): 124 global advice_issued 125 global errors_issued 126 127 response = fetch(iw_url) 128 129 # One way we tell that a redirect occurred is by checking fetch's history, as it 130 # automatically follows redirects. This will catch formal redirects which come from pages 131 # such as Special:PermanentLink. 132 if response.history != []: 133 134 permalink1 = 'Special:PermanentLink/'.lower() 135 permalink2 = 'Special:Permalink/'.lower() 136 page_slug_lower = page_slug.lower() 137 if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2): 138 if debug: 139 possibly_print(page_name) 140 pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug)) 141 find_canonical_link(response.text, page_name, page_slug) 142 else: 143 possibly_print(page_name) 144 pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug)) 145 advice_issued += 1 146 elif response.status_code != 200: 147 possibly_print(page_name) 148 pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url)) 149 errors_issued += 1 150 # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily 151 # using JavaScript, while returning code OK 200 as if the link was correct; this happens 152 # when a redirect page is accessed. We must detect these soft redirects by looking at the 153 # page source to find the redirect note inserted at the top of the page for the reader. 154 elif 'Redirected from <a' in response.text: 155 if debug: 156 possibly_print(page_name) 157 pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug)) 158 find_canonical_link(response.text, page_name, page_slug) 159 else: # URL is OK, so proceed 160 find_section(response.text, page_name, page_slug, False) 46 161 47 162 # Searches the given page text for intrawiki links with section links in them … … 52 167 global advice_issued 53 168 global errors_issued 169 global name_printed 54 170 pages_checked += 1 55 171 name_printed = 0 … … 75 191 s = match.start() + target_start # remove the link-opening markup 76 192 e = match.end() - target_end # remove the link-ending markup 77 link_text= page_text[s:e]193 page_slug = page_text[s:e] 78 194 79 195 # The second link type will look like "Page|Section" or "|Section", so fix that pipe 80 196 if i == 1: 81 link_text = link_text.replace('|', '#')197 page_slug = page_slug.replace('|', '#') 82 198 83 199 # Sometimes we use a space char. instead of a '_', so fix that before querying 84 link_text = link_text.replace(' ', '_')85 if debug: pywikibot.stdout(' Found link {0}.'.format( link_text))200 page_slug = page_slug.replace(' ', '_') 201 if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug)) 86 202 87 203 # If this link doesn't have a section link in it, then we don't care about it, as 88 204 # MediaWiki takes care of checking basic intrawiki links 89 if not '#' in link_text:205 if not '#' in page_slug: 90 206 if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.') 91 207 continue … … 96 212 if found_iw_match == False: 97 213 for prefix in interwiki_prefixes: 98 if prefix + ":" in link_text:99 if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format( link_text))214 if prefix + ":" in page_slug: 215 if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug)) 100 216 is_interwiki = True 101 217 break … … 103 219 continue 104 220 105 # If there is a '{' in the link, then probably it's a link built on transcluded text 106 # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it 107 if '{' in link_text: 221 # If there is a '{' in the link, then probably it's a link built on transcluded text. 222 # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using 223 # our "chapter_names" array. If it's another type of transclusion, punt it to the user. 224 if '{' in page_slug: 108 225 ch_link_pattern = re.compile(r"{{C[0-9]*}}") 109 ch_link = ch_link_pattern.search( link_text)226 ch_link = ch_link_pattern.search(page_slug) 110 227 if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0))) 111 228 if ch_link: … … 118 235 ch_name = chapter_names[ch_num_match] 119 236 replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}") 120 link_text = replace_pattern.sub(ch_name, link_text)121 if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format( link_text))237 page_slug = replace_pattern.sub(ch_name, page_slug) 238 if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug)) 122 239 else: 123 if not name_printed and not debug: 124 pywikibot.stdout('From page "{}":'.format(page_name)) 125 name_printed = 1 126 pywikibot.stdout(' ADVICE: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(link_text, ch_num_match)) 127 advice_issued += 1 240 possibly_print(page_name) 241 pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match)) 242 errors_issued += 1 128 243 continue 129 244 else: 130 if not name_printed and not debug: 131 pywikibot.stdout('From page "{}":'.format(page_name)) 132 name_printed = 1 133 pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(link_text)) 245 possibly_print(page_name) 246 pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug)) 134 247 advice_issued += 1 135 248 continue 136 249 else: 137 if not name_printed and not debug: 138 pywikibot.stdout('From page "{}":'.format(page_name)) 139 name_printed = 1 140 pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(link_text, "{{C7}}")) 250 possibly_print(page_name) 251 pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}")) 141 252 advice_issued += 1 142 253 continue … … 145 256 # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"), 146 257 # we're out of luck. 147 if link_text.startswith('/'):148 link_text = page_name + link_text149 if debug: pywikibot.stdout(' Changed link_text to {} on account of "/".'.format(link_text))258 if page_slug.startswith('/'): 259 page_slug = page_name + page_slug 260 if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug)) 150 261 151 262 # If this is a relative "../" link, find the parent page, set ourselves to that page, 152 263 # then remove the relative portion of the link. Note that this is only performed once, 153 264 # so if there's multiple steps back ("../../"), we're out of luck. 154 if link_text.startswith('../'):265 if page_slug.startswith('../'): 155 266 last_slash = page_name.rfind('/') 156 267 page_name2 = page_name[0:last_slash] 157 268 if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2)) 158 link_text = link_text[3:len(link_text)]159 if debug: pywikibot.stdout(' Changed link_text to {} on account of "../".'.format(link_text))269 page_slug = page_slug[3:len(page_slug)] 270 if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug)) 160 271 # If this is now going to be a bare section link for the parent page, don't add a 161 272 # slash, otherwise do because we are drilling down to another subpage 162 if link_text.startswith('#'):163 link_text = page_name2 + link_text273 if page_slug.startswith('#'): 274 page_slug = page_name2 + page_slug 164 275 else: 165 link_text = page_name2 + '/' + link_text276 page_slug = page_name2 + '/' + page_slug 166 277 167 278 # If this is a bare section link, build URL based on this page 168 if link_text.startswith('#'):279 if page_slug.startswith('#'): 169 280 iw_url = onigalore_url + page_name2 170 281 iw_found += 1 171 if debug: pywikibot.stdout(' Found link to this very page, {}.'.format( link_text))282 if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug)) 172 283 found_iw_match = True 173 link_text = page_name2 + link_text284 page_slug = page_name2 + page_slug 174 285 175 286 # If there's no ":" in the link (before the section link, where a colon would just be 176 287 # part of the text) then it's a Main namespace article; proceed with building URL 177 288 if found_iw_match == False: 178 if not re.search(":.*#", link_text):179 iw_url = onigalore_url + link_text289 if not re.search(":.*#", page_slug): 290 iw_url = onigalore_url + page_slug 180 291 iw_found += 1 181 292 if debug: pywikibot.stdout(' Link is to a Main namespace page.') … … 186 297 if found_iw_match == False: 187 298 for prefix in intrawiki_prefixes: 188 if prefix + ":" in link_text:189 iw_url = onigalore_url + link_text299 if prefix + ":" in page_slug: 300 iw_url = onigalore_url + page_slug 190 301 if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix)) 191 302 iw_found += 1 … … 195 306 # If we still haven't turned this match into a URL, something's gone wrong 196 307 if (found_iw_match == False) or (iw_url == ""): 197 if not name_printed and not debug: 198 pywikibot.stdout('From page "{}":'.format(page_name)) 199 name_printed = 1 200 pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(link_text)) 308 possibly_print(page_name) 309 pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug)) 201 310 continue 202 311 203 312 # Test the URL 204 313 iw_url = iw_url.replace(' ', '_') 205 if debug: pywikibot.stdout(' Reading page at {}...'.format(iw_url)) 206 response = fetch(iw_url) 207 208 # Redirects are followed automatically by fetch() and treated as "200"s; the way we can 209 # tell that a redirect occurred is by checking fetch's history 210 if response.history != []: 211 if not name_printed and not debug: 212 pywikibot.stdout('From page "{}":'.format(page_name)) 213 name_printed = 1 214 pywikibot.stdout(' ADVICE: Got redirection code ({0}) on URL "{1}". You should check the link manually.'.format(response.history[0], iw_url)) 215 advice_issued += 1 216 elif response.status_code != 200: 217 if not name_printed and not debug: 218 pywikibot.stdout('From page "{}":'.format(page_name)) 219 name_printed = 1 220 pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url)) 221 errors_issued += 1 222 else: 223 # Isolate section link 224 pre_section, section_name = link_text.split('#', 1) 225 if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(section_name)) 226 227 # Convert slash character to the dot-notation hex encoding that MediaWiki uses 228 section_name = section_name.replace('/', '.2F') 229 230 # Read linked page to see if it really has this anchor link 231 soup = BeautifulSoup(response.text, 'html.parser') 232 found_section = False 233 for span_tag in soup.findAll('span'): 234 span_name = span_tag.get('id', None) 235 if span_name == section_name: 236 if debug: pywikibot.stdout(' Found section!') 237 found_section = True 238 break 239 if found_section == False: 240 if not name_printed and not debug: 241 pywikibot.stdout('From page "{}":'.format(page_name)) 242 name_printed = 1 243 pywikibot.stdout(' ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section)) 244 errors_issued += 1 245 246 def main(*args): 247 global debug 314 if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url)) 315 test_intrawiki_link(iw_url, page_name, page_slug) 316 317 # Print a wrap-up message 318 def print_summary(): 248 319 global pages_checked 249 320 global iw_found 250 321 global advice_issued 251 322 global errors_issued 323 324 page_str = "pages" 325 if pages_checked == 1: 326 page_str = "page" 327 328 link_str = "links" 329 if iw_found == 1: 330 link_str = "link" 331 332 pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str)) 333 pywikibot.stdout('While attempting to follow section links….') 334 335 if advice_issued == 0: 336 pywikibot.stdout(' No advice on potential problems was issued.') 337 elif advice_issued == 1: 338 pywikibot.stdout(' 1 piece of advice on a potential problem was issued.') 339 else: 340 pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued)) 341 342 error_str = "errors were" 343 if errors_issued == 1: 344 error_str = "error was" 345 pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str)) 346 347 # Main function 348 def main(*args): 349 global debug 252 350 search_cat = '' 253 351 search_page = '' 254 352 353 # Process arguments 255 354 local_args = pywikibot.handle_args(args) 256 genFactory = pagegenerators.GeneratorFactory()257 258 355 for arg in local_args: 259 356 if arg.startswith('-cat:'): … … 272 369 #pywikibot.stdout(format(dir(page))) 273 370 371 # Check specified page or loop through specified category and check all pages 274 372 if search_cat != '': 275 373 cat_obj = pywikibot.Category(site, search_cat) … … 283 381 scan_for_intrawiki_links(page.text, page.title()) 284 382 285 page_str = "pages" 286 if pages_checked == 1: 287 page_str = "page" 288 289 link_str = "links" 290 if iw_found == 1: 291 link_str = "link" 292 293 pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str)) 294 pywikibot.stdout('While attempting to follow section links...') 295 296 if advice_issued == 0: 297 pywikibot.stdout(' No advice on potential problems was issued.') 298 elif advice_issued == 1: 299 pywikibot.stdout(' 1 piece of advice on a potential problem was issued.') 300 else: 301 pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued)) 302 303 error_str = "errors were" 304 if errors_issued == 1: 305 error_str = "error was" 306 pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str)) 383 # Print the results 384 print_summary() 307 385 308 386 if __name__ == '__main__':
Note:
See TracChangeset
for help on using the changeset viewer.