source: ValBot/Python/check_interwiki_links.py@ 1196

Last change on this file since 1196 was 1196, checked in by iritscen, 27 hours ago

ValBot: check_interwiki_links.sh now tallies and more clearly marks redirects that are probably not intended. Redirect target page is now correctly stated in one message about redirects. Streamlined code somewhat.

File size: 13.2 KB
# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered
# interwiki prefix and loads the linked page, verifying that it exists and that any section
# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
# validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
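#
# Example invocations (a sketch based on the arguments handled in main() below; the exact
# launcher, e.g. "python pwb.py", depends on the local Pywikibot setup):
#   check_interwiki_links.py -page:"Some Page"      (check the links on a single page)
#   check_interwiki_links.py -cat:"Some Category"   (check every page in a category, recursively)
#   check_interwiki_links.py -page:"Some Page" -dbg (also print each link as it is validated)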

import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from pywikibot.tools.formatter import color_format
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
        self.iw_prefix = iw_prefix # e.g. "wp"
        self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
        self.page_name = page_name # "Easter egg"
        self.page_slug = page_slug # "Easter_egg"
        self.curl_response = curl_response # a class defined in the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')
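# (Each prefix in interwiki_prefixes maps to the URL at the same index in interwiki_urls,
# e.g. 'wp' -> 'https://en.wikipedia.org/wiki/', so the two tuples must stay the same length
# and in the same order.)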

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(page_name))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link
    target_page_name, anchor_name = the_link.page_slug.split('#')
    target_page_name_human = target_page_name.replace('_', ' ')

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)
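    # (The ".XX" forms above are MediaWiki's dot-encoded hex notation for special characters
    # in section anchors; e.g. a section named "Easter egg (media)" may appear in a link as
    # "#Easter_egg_.28media.29".)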

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if found_section == False:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
        # TODO: Check that page name has been corrected to redirected page if there was a redirect
        errors_issued = errors_issued + 1
    elif print_result == True:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))

# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the
            # output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                _, anchor_name = the_link.page_slug.split('#')
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
                the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name) # update page slug so that find_section() uses the right page name in its messages
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    if the_link.curl_response.history != []:
        possibly_print(the_link.page_name)

        # If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued = errors_issued + 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    # However, the usual way that a redirect occurs is that MediaWiki silently serves the
    # target page's content while returning code OK 200 as if the link were correct; this
    # happens when a redirect page is accessed. We must detect these soft redirects by looking
    # at the page source to find the redirect note inserted at the top of the page for the reader.
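    # (In the rendered HTML that note looks like "(Redirected from <a ...>Old title</a>)",
    # which is the substring the check below relies on.)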
    elif 'Redirected from <a' in the_link.curl_response.text:
        unintended_redirects_found = unintended_redirects_found + 1
        possibly_print(the_link.page_name)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
        find_canonical_link(the_link)
    elif '#' in the_link.page_slug:
        find_section(the_link, False)

# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked = pages_checked + 1
    cur_prefix = 0
    name_printed = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
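        # (e.g. for the prefix "wp", this pattern matches both "[[wp:Easter egg]]" and the
        # piped form "[[wp:Easter egg|eggs]]", with the match ending at the "|" or "]".)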
        for match in re.finditer(iw_link, page_text):
            the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", page_name, "", "")

            # Extract just the page title from this regex match
            s = match.start() + 2 + len(the_link.iw_prefix) + 1
            e = match.end() - 1

            # Commonly we use spaces instead of underscores, so fix that before querying
            the_link.page_slug = page_text[s:e].replace(' ', '_')

            # But use spaces for title when printing it
            page_title_human = the_link.page_slug.replace('_', ' ')
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, page_title_human))
            iw_found = iw_found + 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
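            # (e.g. a hypothetical link "[[wp:fr:Easter egg]]" should be fetched from
            # https://fr.wikipedia.org/wiki/Easter_egg rather than from the English Wikipedia)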
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')

            # Test the URL
            test_interwiki_link(the_link)
        cur_prefix = cur_prefix + 1

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirects were"
    if unintended_redirects_found == 1:
        warning_str = "likely-unintended redirect was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    #pywikibot.stdout('The members of the requests.models.Response class are:')
    #pywikibot.stdout(format(dir(requests.models.Response)))
    #return

    # Check specified page or loop through specified category and check all pages
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
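        # (PreloadingGenerator's second argument is the batch size, so page texts should be
        # fetched from the wiki in groups of 100 to reduce the number of API calls)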
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()