source: ValBot/Python/check_intrawiki_section_links.py@ 1211

Last change on this file since 1211 was 1205, checked in by iritscen, 3 months ago

ValBot: Revised logic in check_intrawiki_section_links.py as MediaWiki now apparently returns response 301 when the user is redirected by a redirect page.

File size: 19.2 KB
Line 
1# Check Intrawiki Section Links
2# by iritscen@yahoo.com
3# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4# and loads the linked page and verifies that the named section actually exists. It also
5# understands section links generated through a call to Template:SectionLink.
6# Recommended viewing width:
7# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8
9import os
10
11from urllib.parse import urljoin
12
13import pywikibot
14import re
15
16from pywikibot.bot import QuitKeyboardInterrupt
17from pywikibot import pagegenerators
18from pywikibot.comms.http import fetch
19from pywikibot.specialbots import UploadRobot
20from bs4 import BeautifulSoup
21
# Tuple of OniGalore's namespaces; used to recognize intrawiki links that live outside the
# Main namespace
intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')

# URL for main namespace of our wiki; page slugs are appended to this to build testable URLs
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
# (a separate script, check_interwiki_links.py, handles those)
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion;
# indexed by chapter number, so valid indices are 0 through 14
chapter_names = ['CHAPTER_00_._COMBAT_TRAINING', 'CHAPTER_01_._TRIAL_RUN', 'CHAPTER_02_._ENGINES_OF_EVIL', 'CHAPTER_03_._PUZZLE_PIECES', 'CHAPTER_04_._TIGER_BY_THE_TAIL', 'CHAPTER_05_._HOT_PURSUIT', 'CHAPTER_06_._COUNTERATTACK', 'CHAPTER_07_._A_FRIEND_IN_NEED', 'CHAPTER_08_._AN_INNOCENT_LIFE', 'CHAPTER_09_._TRUTH_AND_CONSEQUENCES', 'CHAPTER_10_._CAT_AND_MOUSE', 'CHAPTER_11_._DREAM_DIVER', 'CHAPTER_12_._SINS_OF_THE_FATHER', 'CHAPTER_13_._PHOENIX_RISING', 'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS']

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
link_patterns = (r"\[\[[^|\]]*(\||\])", r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}")

# Initialize globals
debug = 0           # set to 1 by the "-dbg" command-line argument
pages_checked = 0   # number of pages scanned so far
iw_found = 0        # number of intrawiki section links found
advice_issued = 0   # number of "ADVICE" notices printed (possible problems)
errors_issued = 0   # number of "ERROR" notices printed (definite problems)
name_printed = 0    # whether the current page's name has been printed yet (reset per page)
46
# Print the name of the page on which something occurred, unless it was already printed
# (or unless debug mode is on, in which case page names are printed as they are checked)
def possibly_print(page_name):
    global debug
    global name_printed

    # Guard: nothing to do if the name went out already or debug mode handles it
    if name_printed or debug:
        return

    pywikibot.stdout('')
    pywikibot.stdout('From page "{}":'.format(page_name))
    name_printed = 1
56
# Search a page for the section specified in the link.
#   page_text:    HTML source of the linked (target) page
#   page_name:    name of the page the link was found on (used for error reporting)
#   page_slug:    the link target in "Page_name#anchor" form
#   print_result: when True (and debug is on), print a message if the section is found
# Increments the global errors_issued counter when the section cannot be located.
def find_section(page_text, page_name, page_slug, print_result):
    global errors_issued
    found_section = False

    # Isolate section link or text fragment link
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')

    # First check if this is a text fragment directive, and look for it if so
    if anchor_name.startswith(':~:text='):
        if debug: pywikibot.stdout(' Found text fragment directive {} from URL {}.'.format(anchor_name, page_slug))
        # Strip the ':~:text=' prefix (8 characters)
        anchor_name = anchor_name[8:]
        # We're only checking the first text directive, so strip add'l ones if present
        addl_fragment = anchor_name.find('&text=')
        if addl_fragment != -1:
            anchor_name = anchor_name[:addl_fragment]
        search_terms = anchor_name.split(',')
        # Delete prefix and suffix terms because they aren't needed
        if search_terms[0].endswith('-'):
            search_terms.pop(0)
        if search_terms[-1].startswith('-'):
            search_terms.pop()
        # Remake text directive with the terms separated by spaces as they should be in the page
        # text
        newSep = ' '
        search_string = newSep.join(search_terms)
        if debug: pywikibot.stdout(' Converted text fragment to string "{}".'.format(search_string))
        if search_string in page_text:
            found_section = True
            if debug and not print_result: pywikibot.stdout(' Found text fragment!')

    # If we're still here, it's a section link; read linked page to see if it really has this
    # anchor link
    if found_section == False:
        if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
        soup = BeautifulSoup(page_text, 'html.parser')
        # Search for a span with this ID
        for span_tag in soup.findAll('span'):
            span_name = span_tag.get('id', None)
            if span_name == anchor_name:
                if debug and not print_result: pywikibot.stdout(' Found section in a span!')
                found_section = True
                break
        if found_section == False:
            # Search for a div with this ID
            for span_tag in soup.findAll('div'):
                span_name = span_tag.get('id', None)
                if span_name == anchor_name:
                    if debug and not print_result: pywikibot.stdout(' Found section in a div!')
                    found_section = True
                    break
        # Neither a span nor a div carries this ID, so the section link is broken
        if found_section == False:
            possibly_print(page_name)
            pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
            errors_issued += 1
        elif debug and print_result:
            pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
115
# For a link that redirected us to another page, extract the name of the target page from the
# target page's source.
#   page_text: HTML source of the page we were redirected to
#   page_name: name of the page the link was found on (used for error reporting)
#   page_slug: the original link target in "Page_name#anchor" form
# On success, reports the redirect target (and re-checks any section link against the target
# page); on failure, increments the global errors_issued counter.
def find_canonical_link(page_text, page_name, page_slug):
    # Fix: this declaration was missing, so the "errors_issued = errors_issued + 1" lines
    # below raised UnboundLocalError whenever an error branch was taken
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    # "wgPageName":"Namespace:Page_name",
    canonical_name = page_text.split('"wgPageName":"')[-1]
    tag_end = canonical_name.find('",')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
        errors_issued += 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the output
            # with massive page source from a failed trim.
            # Fix: the original format string mixed automatic ("{}") and manual ("{2}") field
            # numbering, which makes str.format() raise ValueError instead of printing
            pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
            errors_issued += 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in page_slug:
                # The link had a section anchor, so verify it exists on the target page
                _, anchor_name = page_slug.split('#')
                if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
                find_section(page_text, page_name, page_slug, True)
            else:
                pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))
143
# Test an intrawiki link and look for a section link if applicable.
#   iw_url:    the full URL to fetch
#   page_name: name of the page the link was found on (used for error reporting)
#   page_slug: the link target in "Page_name#anchor" form
def test_intrawiki_link(iw_url, page_name, page_slug):
    global advice_issued
    global errors_issued

    response = fetch(iw_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    permalink_prefixes = ('Special:PermanentLink/'.lower(), 'Special:Permalink/'.lower())
    slug_lc = page_slug.lower()
    went_through_history = response.history != []

    if went_through_history and slug_lc.startswith(permalink_prefixes):
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
        find_canonical_link(response.text, page_name, page_slug)
    elif 'Redirected from <a' in response.text:
        # However the usual way that a redirect occurs is that a redirect page is visited and
        # MediaWiki sends us to the new page using JavaScript while returning code 301. Formerly it
        # used to return 200 as if the link was correct, so rather than looking for code 301 we
        # detect these soft redirects by looking at the page source to find the redirect note that
        # gets inserted at the top of the page for the reader.
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
        find_canonical_link(response.text, page_name, page_slug)
    elif response.status_code != 200:
        # This handles response codes other than 200 and 301 (301 is returned in the above case
        # of a silent redirect)
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
        errors_issued += 1
    else:
        # URL is OK, so proceed
        find_section(response.text, page_name, page_slug, False)
180
# Searches the given page text for intrawiki links with section links in them.
#   page_text: the wikitext of the page to scan
#   page_name: the title of that page (used to resolve relative links and for reporting)
# For each "[[Page#Section]]" link or "{{SectionLink|Page|Section}}" call found, builds the
# page's URL, fetches it, and verifies the section anchor exists. Updates the global
# counters (pages_checked, iw_found, advice_issued, errors_issued) as it goes.
def scan_for_intrawiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)
            if i == 1:
                target_start = 14 # "{{SectionLink|"
                target_end = 2 # "}}"
            s = match.start() + target_start # remove the link-opening markup
            e = match.end() - target_end # remove the link-ending markup
            page_slug = page_text[s:e]

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if not '#' in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
            # for the task of checking interwiki page+section links
            is_interwiki = False
            if found_iw_match == False:
                for prefix in interwiki_prefixes:
                    if prefix + ":" in page_slug:
                        if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                        is_interwiki = True
                        break
            if is_interwiki:
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded text.
            # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using
            # our "chapter_names" array. If it's another type of transclusion, punt it to the user.
            if '{' in page_slug:
                ch_link_pattern = re.compile(r"{{C[0-9]*}}")
                ch_link = ch_link_pattern.search(page_slug)
                if ch_link:
                    # Fix: this debug print formerly ran BEFORE the "if ch_link:" check, so a
                    # link containing '{' but no "{{Cn}}" crashed on ch_link.group(0) (None
                    # dereference) in debug mode instead of reaching the ADVICE branch below
                    if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))
                    ch_link_match = ch_link.group(0)
                    ch_num_pattern = re.compile("[0-9]+")
                    ch_num = ch_num_pattern.search(ch_link_match)
                    if ch_num:
                        ch_num_match = int(ch_num.group(0))
                        if ch_num_match >= 0 and ch_num_match <= 14:
                            ch_name = chapter_names[ch_num_match]
                            replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}")
                            page_slug = replace_pattern.sub(ch_name, page_slug)
                            if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))
                        else:
                            possibly_print(page_name)
                            pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                            errors_issued += 1
                            continue
                    else:
                        possibly_print(page_name)
                        pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                        advice_issued += 1
                        continue
                else:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

            # If this is a relative "/" link, use the current page as the basis for the URL. Note
            # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
            # we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that page,
            # then remove the relative portion of the link. Note that this is only performed once,
            # so if there's multiple steps back ("../../"), we're out of luck.
            if page_slug.startswith('../'):
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:len(page_slug)]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't add a
                # slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would just be
            # part of the text) then it's a Main namespace article; proceed with building URL
            if found_iw_match == False:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if found_iw_match == False:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong
            if (found_iw_match == False) or (iw_url == ""):
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)
335
# Print a wrap-up message summarizing how many pages and links were checked and how many
# advisories and errors were emitted along the way
def print_summary():
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued

    # Pick singular or plural wording to match each count
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
365
# Main function: parse the command line, then check either one page ("-page:") or every page
# in a category ("-cat:", recursively) for intrawiki section links, and print a summary
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            search_cat = arg[len('-cat:'):]
        elif arg.startswith('-page:'):
            search_page = arg[len('-page:'):]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # This line of code enumerates the methods in the 'page' class
    #pywikibot.stdout(format(dir(page)))

    # Check specified page or loop through specified category and check all pages
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
            scan_for_intrawiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.