source: ValBot/Python/check_interwiki_links.py@ 1197

Last change on this file since 1197 was 1197, checked in by iritscen, 17 hours ago

ValBot: Removed TODO comment from check_interwiki_links.sh which was completed in the last commit.

File size: 13.1 KB
Line 
1# Check Interwiki Links
2# by iritscen@yahoo.com
3# Looks at each link on a page (or all the pages in a category) which uses a registered
4# interwiki prefix and loads the linked page, verifying that it exists and that any section
5# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
6# validate the interwiki link.
7# Recommended viewing width:
8# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
9
10import bs4
11import pywikibot
12import re
13import requests # for listing members with dir() when debugging
14
15from bs4 import BeautifulSoup
16from pywikibot import pagegenerators
17from pywikibot.bot import QuitKeyboardInterrupt
18from pywikibot.comms.http import fetch
19from pywikibot.specialbots import UploadRobot
20from pywikibot.tools.formatter import color_format
21from urllib.parse import urljoin
22
class IWLink:
    """Bundle of everything known about one interwiki link found on a wiki page.

    Instances are created by scan_for_interwiki_links() and then filled in /
    mutated by test_interwiki_link() and find_canonical_link().
    """

    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
        self.iw_prefix = iw_prefix         # e.g. "wp"
        self.prefix_url = prefix_url       # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url           # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
        self.page_name = page_name         # "Easter egg"
        self.page_slug = page_slug         # "Easter_egg"
        self.curl_response = curl_response # a class defined in the Requests library

    def __repr__(self):
        # Aid debugging: show the prefix and slug that identify this link
        return 'IWLink(iw_prefix={0!r}, page_slug={1!r}, full_url={2!r})'.format(
            self.iw_prefix, self.page_slug, self.full_url)
31
# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
# NOTE: interwiki_urls[i] is the base URL for interwiki_prefixes[i]; the two
# tuples must stay the same length and in the same order because
# scan_for_interwiki_links() walks them with a shared index.
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')
36
# Initialize globals
debug = 0                        # set to 1 by the -dbg argument; enables per-link progress output
pages_checked = 0                # how many wiki pages have been scanned
iw_found = 0                     # how many interwiki links have been found across all pages
errors_issued = 0                # how many "ERROR" messages have been printed
unintended_redirects_found = 0   # how many silent (soft) redirects have been detected
name_printed = 0                 # per-page flag: has the current page's name been printed yet?
44
# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    global debug
    global name_printed

    # Nothing to do if the name was already shown, or in debug mode (where every
    # page name is printed as it is checked anyway)
    if name_printed or debug:
        return

    pywikibot.stdout('')
    pywikibot.stdout('From page "{}":'.format(page_name))
    name_printed = 1
54
# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link. Split only on the first '#' so a slug that happens to
    # contain additional '#' characters cannot raise a ValueError from unpacking.
    target_page_name, anchor_name = the_link.page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)

    # Read linked page to see if it really has this anchor link; only these tag
    # types are considered as possible anchor targets
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    # any() short-circuits on the first matching tag, like the original
    # nested-loop-with-break but without the manual flag bookkeeping
    found_section = any(
        the_tag.get('id') == anchor_name
        for tag_name in tags_to_search
        for the_tag in soup.find_all(tag_name)
    )

    # Tell user what we found
    if not found_section:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
        errors_issued = errors_issued + 1
    elif print_result == True:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
87
# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(the_link):
    # BUG FIX: without this declaration, the "errors_issued = errors_issued + 1"
    # lines below raised UnboundLocalError instead of updating the module-level
    # error counter, crashing the script on the first malformed redirect.
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the
            # output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                _, anchor_name = the_link.page_slug.split('#')
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
                the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name) # update page slug so that find_section() uses the right page name in its messages
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))
117
# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)
    response = the_link.curl_response
    slug = the_link.page_slug

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    if response.history:
        possibly_print(the_link.page_name)

        # An all-caps target like WP:BEANS is likely a deliberate use of a redirect shortcut
        if slug.startswith('WP:') and slug == slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(response.history[0], the_link.iw_prefix, slug))
            find_canonical_link(the_link)
        # Permanent-revision links always redirect; follow them to the live page
        elif slug.lower().startswith(('special:permanentlink/', 'special:permalink/')):
            pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(response.history[0], the_link.iw_prefix, slug))
            find_canonical_link(the_link)
        else:
            pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(response.history[0], the_link.iw_prefix, slug))
            errors_issued = errors_issued + 1
    elif response.status_code != 200:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(response.status_code, the_link.iw_prefix, slug))
        errors_issued = errors_issued + 1
    # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
    # using JavaScript, while returning code OK 200 as if the link was correct; this happens
    # when a redirect page is accessed. We must detect these soft redirects by looking at the
    # page source to find the redirect note inserted at the top of the page for the reader.
    elif 'Redirected from <a' in response.text:
        unintended_redirects_found = unintended_redirects_found + 1
        possibly_print(the_link.page_name)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, slug))
        find_canonical_link(the_link)
    elif '#' in slug:
        find_section(the_link, False)
160
# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked = pages_checked + 1
    name_printed = 0

    # Walk the parallel prefix/URL tuples in lockstep
    for prefix, prefix_url in zip(interwiki_prefixes, interwiki_urls):
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        link_pattern = r"\[\[" + prefix + r":[^|\]]*(\||\])"
        for match in re.finditer(link_pattern, page_text):
            the_link = IWLink(prefix, prefix_url, "", page_name, "", "")

            # Extract just the page title from this regex match: skip past
            # "[[" + prefix + ":" and drop the trailing "|" or "]"
            title_start = match.start() + 2 + len(prefix) + 1
            title_end = match.end() - 1

            # Commonly we use spaces instead of underscores, so fix that before querying
            the_link.page_slug = page_text[title_start:title_end].replace(' ', '_')

            # But use spaces for title when printing it
            page_title_human = the_link.page_slug.replace('_', ' ')
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, page_title_human))
            iw_found = iw_found + 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')

            # Test the URL
            test_interwiki_link(the_link)
203
# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    # Pick singular or plural wording for each counter
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"
    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirect was" if unintended_redirects_found == 1 else "likely-unintended redirects were"
    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))
232
# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            search_cat = arg[len('-cat:'):]
        elif arg.startswith('-page:'):
            search_page = arg[len('-page:'):]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    site = pywikibot.Site()

    # Check specified page or loop through specified category and check all pages
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.