source: ValBot/Python/check_interwiki_links.py@ 1211

Last change on this file since 1211 was 1207, checked in by iritscen, 5 weeks ago

ValBot: Added throttle to check_interwiki_links.py to avoid Wikipedia kicking us out with error 429.

File size: 13.8 KB
# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered interwiki prefix and loads the linked page, verifying that it exists and that
# any section link, if present, is valid as well. The output will use the word "ERROR" when it cannot validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----|
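# Usage: python check_interwiki_links.py -page:"Page name"
#        python check_interwiki_links.py -cat:"Category name"  (the category is searched recursively)
# Pass -dbg to also print each page and link as it is checked.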

import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging
import time

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response):
        self.iw_prefix = iw_prefix # e.g. "wp" as in [[wp:Marathon (series)#Rampancy]]
        self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy"
        self.page_name = page_name # "Marathon (series)#Rampancy"
        self.page_name_only = page_name_only # "Marathon (series)"
        self.page_slug = page_slug # "Marathon_(series)#Rampancy"
        self.hosting_page = hosting_page # "Easter eggs"; page where the link was found
        self.curl_response = curl_response # an instance of the Response class defined in the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')
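# (The two tuples above are parallel: scan_for_interwiki_links() walks them by a shared index, so a prefix added to
# interwiki_prefixes needs its URL inserted at the same position in interwiki_urls.)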

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0
request_delay = 1.5 # base number of seconds to wait between HTTP requests
max_retries = 3     # abort the script after this many consecutive 429 responses for one link
backoff_factor = 2  # multiply the current delay by this after each 429
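# (With the values above, a link that keeps drawing error 429 is retried after waits of 3.0, 6.0 and 12.0 seconds,
# the delay doubling each time, before test_interwiki_link() aborts the script.)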

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(the_link):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(the_link.hosting_page))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link
    _, anchor_name = the_link.page_slug.split('#', 1)

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)
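    # (e.g. these substitutions turn the anchor "Marathon_.28series.29" into "Marathon_(series)")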

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if not found_section:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
        errors_issued += 1
    elif print_result:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))

# For a link that redirected us to another page, extract the name of the target page from the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name">
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued += 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report an error and avoid slamming the output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued += 1
        else:
            the_link.page_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                the_link.page_name_only, _ = the_link.page_slug.split('#', 1)
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_name_only, the_link.page_name))
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, the_link.page_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    # We have to carefully throttle requests because otherwise we will get hit with a 429: Too Many Requests
    attempt = 0
    delay = request_delay
    while True:
        time.sleep(delay)

        the_link.curl_response = fetch(the_link.full_url)

        if the_link.curl_response.status_code != 429:
            break

        attempt += 1
        if attempt > max_retries:
            pywikibot.stdout(f' ERROR: Maximum retries after error 429 exceeded for "{the_link.page_slug}". Aborting script.')
            raise SystemExit(1)

        # Increase the delay if we got the error
        delay *= backoff_factor
        pywikibot.stdout(f' WARNING: Received error 429 for "{the_link.page_slug}". Retrying in {delay:.1f}s...')

    # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from
    # pages such as Special:PermanentLink.
    if the_link.curl_response.history:
        possibly_print(the_link)

        # If the linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued += 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued += 1
    # However, the usual way that a redirect occurs is that MediaWiki silently serves the content of the target page under the original URL, returning code OK 200 as
    # if the link was correct; this happens when a redirect page is accessed. We must detect these soft redirects by looking at the page source to find the redirect
    # note inserted at the top of the page for the reader.
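    # (In current MediaWiki output, that note's markup is along the lines of
    # <span class="mw-redirectedfrom">(Redirected from <a href="…">Title</a>)</span>, hence the search string below,
    # though the exact class and wrapper may vary.)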
    elif 'Redirected from <a' in the_link.curl_response.text:
        unintended_redirects_found += 1
        possibly_print(the_link)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
        find_canonical_link(the_link) # calls find_section() at end
    elif '#' in the_link.page_slug:
        find_section(the_link, False)

# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked += 1
    cur_prefix = 0
    name_printed = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
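        # (e.g. for the prefix "wp", this matches "[[wp:Marathon (series)#Rampancy]]" as well as the link portion of "[[wp:Marathon (series)|piped text]]")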
        for match in re.finditer(iw_link, page_text):
            the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", "", "", "", page_name, "")

            # Extract just the page title from this regex match
            s = match.start() + 2 + len(the_link.iw_prefix) + 1
            e = match.end() - 1

            # Use underscores in the slug used to construct the URL, but retain spaces for the printable name
            the_link.page_slug = page_text[s:e].replace(' ', '_')
            the_link.page_name = page_text[s:e]
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, the_link.page_name))
            iw_found += 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')
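            # (e.g. "[[wp:fr:Paris]]" yields https://en.wikipedia.org/wiki/fr:Paris above, which the two replacements turn into https://fr.wikipedia.org/wiki/Paris)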

            # Test the URL
            test_interwiki_link(the_link)
        cur_prefix += 1

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirects were"
    if unintended_redirects_found == 1:
        warning_str = "likely-unintended redirect was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    #pywikibot.stdout('The members of the requests.models.Response class are:')
    #pywikibot.stdout(format(dir(requests.models.Response)))
    #return

    # Check specified page or loop through specified category and check all pages
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100): # preload page text in batches of 100
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()