Changeset 1171
- Timestamp: Mar 21, 2022, 10:23:25 PM (3 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
ValBot/Python/check_intrawiki_section_links.py
r1169 r1171 1 # Check Intrawiki Section Links 2 # by iritscen@yahoo.com 3 # Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'), 4 # and loads the linked page and verifies that the named section actually exists. The output will 5 # use the keywords ADVICE, WARNING or ERROR depending on the nature of issue that it encounters. 6 # Recommended viewing width: 7 # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --| 8 1 9 import os 2 10 … … 24 32 pages_checked = 0 25 33 iw_found = 0 26 problems_found = 0 34 advice_issued = 0 35 warnings_issued = 0 36 errors_issued = 0 27 37 page_name = '' 28 38 … … 31 41 global pages_checked 32 42 global iw_found 33 global problems_found 43 global advice_issued 44 global warnings_issued 45 global errors_issued 34 46 global page_name 35 47 pages_checked = pages_checked + 1 … … 51 63 # Sometimes we used a space char. instead of a '_', so fix that before querying 52 64 link_text = link_text.replace(' ', '_') 53 #pywikibot. output('Found link {0}.'.format(link_text))65 #pywikibot.stdout('Found link {0}.'.format(link_text)) 54 66 55 67 # If this link doesn't have a section link in it, then we don't care about it, as 56 68 # MediaWiki takes care of checking basic intrawiki links 57 69 if not '#' in link_text: 58 #pywikibot. output('Link doesn\'t have a section anchor in it. Skipping.')70 #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.') 59 71 continue 60 72 … … 62 74 # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it 63 75 if '{' in link_text: 64 pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text)) 76 pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. 
You should check it manually.'.format(link_text)) 77 advice_issued = advice_issued + 1 65 78 continue 66 67 # If this is a relative "../" link, find the parent page and set ourselves to that 68 # page, then remove the relative portion of the link. Note that this is only performed 69 # once, so if there's multiple steps back ("../../"), we're out of luck. 79 80 # If this is a relative "/" link, use the current page as the basis for the URL. Note 81 # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"), 82 # we're out of luck. 83 if link_text.startswith('/'): 84 link_text = page_name + link_text 85 pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text)) 86 87 # If this is a relative "../" link, find the parent page and set ourselves to that page, 88 # then remove the relative portion of the link. Note that this is only performed once, 89 # so if there's multiple steps back ("../../"), we're out of luck. 70 90 if link_text.startswith('../'): 71 91 last_slash = page_name.rfind('/') 72 92 page_name2 = page_name[0:last_slash] 73 #pywikibot. output('Changed page_name to {} on account of "../".'.format(page_name2))93 #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2)) 74 94 link_text = link_text[3:len(link_text)] 75 #pywikibot. output('Changed link_text to {} on account of "../".'.format(link_text))76 # If this is now going to be a bare section link for the parent page, don't add 77 # aslash, otherwise do because we are drilling down to another subpage95 #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text)) 96 # If this is now going to be a bare section link for the parent page, don't add a 97 # slash, otherwise do because we are drilling down to another subpage 78 98 if link_text.startswith('#'): 79 99 link_text = page_name2 + link_text … … 85 105 iw_url = onigalore_url + page_name2 86 106 iw_found = iw_found + 1 87 #pywikibot. 
output('Found link to this very page, {}.'.format(link_text))107 #pywikibot.stdout('Found link to this very page, {}.'.format(link_text)) 88 108 found_iw_match = True 89 109 link_text = page_name2 + link_text … … 91 111 # If there's no ":" in the link (before the section link, where a colon would just be 92 112 # part of the text) then it's a Main namespace article, so construct URL 93 #if not ':' in link_text:94 113 if found_iw_match == False: 95 114 if not re.search(":.*#", link_text): 96 115 iw_url = onigalore_url + link_text 97 116 iw_found = iw_found + 1 98 #pywikibot. output('Found link to OniGalore Main namespace page {}.'.format(link_text))117 #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text)) 99 118 found_iw_match = True 100 119 … … 102 121 if found_iw_match == False: 103 122 for prefix in intrawiki_prefixes: 104 #pywikibot. output('Comparing link against prefix {}.'.format(prefix))123 #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix)) 105 124 if prefix + ":" in link_text: 106 125 iw_url = onigalore_url + link_text 107 126 _, post_ns = link_text.split(':', 1) 108 #pywikibot. output('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))127 #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns)) 109 128 iw_found = iw_found + 1 110 129 found_iw_match = True … … 117 136 for prefix in interwiki_prefixes: 118 137 if prefix + ":" in link_text: 119 #pywikibot. output('Skipping link {} because it is an interwiki link.'.format(link_text))138 #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text)) 120 139 is_interwiki = True 121 140 break … … 125 144 # If we still haven't turned this match into a URL, something's gone wrong 126 145 if (found_iw_match == False) or (iw_url == ""): 127 pywikibot. output('ERROR: Couldn\'t figure out link {}. 
Aborting script.'.format(link_text))146 pywikibot.stdout('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text)) 128 147 quit() 129 148 130 149 # Test the URL 131 150 iw_url = iw_url.replace(' ', '_') 132 #pywikibot. output('Reading page at {}...'.format(iw_url))151 #pywikibot.stdout('Reading page at {}...'.format(iw_url)) 133 152 response = fetch(iw_url) 134 153 135 # Redirects are followed automatically by fetch() and treated as "200"s , so the136 # way we tell that a redirect occurred is by checking thehistory154 # Redirects are followed automatically by fetch() and treated as "200"s; the way we can 155 # tell that a redirect occurred is by checking fetch's history 137 156 if response.history != []: 138 pywikibot. output('WARNING: Redirected from {}.'.format(response.history))139 problems_found = problems_found + 1157 pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url)) 158 warnings_issued = warnings_issued + 1 140 159 elif response.status_code != 200: 141 #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg142 problems_found = problems_found + 1160 pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url)) 161 warnings_issued = warnings_issued + 1 143 162 else: 144 163 # Isolate section link 145 164 pre_section, section_name = link_text.split('#', 1) 146 #pywikibot. output('Searching for section link {} on page.'.format(section_name))165 #pywikibot.stdout('Searching for section link {} on page.'.format(section_name)) 147 166 148 167 # Convert slash character to the dot-notation hex encoding that MediaWiki uses … … 155 174 span_name = span_tag.get('id', None) 156 175 if span_name == section_name: 157 #pywikibot. output('Found section!')176 #pywikibot.stdout('Found section!') 158 177 found_section = True 159 178 break 160 179 if found_section == False: 161 pywikibot. 
output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))162 problems_found = problems_found + 1180 pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section)) 181 errors_issued = errors_issued + 1 163 182 164 183 def main(*args): … … 194 213 global pages_checked 195 214 global iw_found 196 global problems_found 197 pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found)) 215 global advice_issued 216 global warnings_issued 217 global errors_issued 218 219 page_str = "pages" 220 if pages_checked == 1: 221 page_str = "page" 222 223 link_str = "links" 224 if iw_found == 1: 225 link_str = "link" 226 227 pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str)) 228 pywikibot.stdout('While attempting to follow section links...') 229 230 if advice_issued == 0: 231 pywikibot.stdout(' No advice on potential problems was issued.') 232 elif advice_issued == 1: 233 pywikibot.stdout(' 1 piece of advice on a potential problem was issued.') 234 else: 235 pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued)) 236 237 warning_str = "warnings were" 238 if warnings_issued == 1: 239 warning_str = "warning was" 240 pywikibot.stdout(' {0} {1} issued.'.format(warnings_issued, warning_str)) 241 242 error_str = "errors were" 243 if errors_issued == 1: 244 error_str = "error was" 245 pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str)) 198 246 199 247 if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.