source: ValBot/Python/check_interwiki_links.py @ 1205

Last change on this file since 1205 was 1204, checked in by iritscen, 3 months ago

ValBot: Removed import that was causing trouble with current version of Pywikibot; it wasn't being used anyway.

File size: 13.0 KB
# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered interwiki prefix and loads the linked page, verifying that it exists and that
# any section link, if present, is valid as well. The output will use the word "ERROR" when it cannot validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----|

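# Sample invocations (assuming the usual Pywikibot "pwb.py" wrapper; see main() for the recognized arguments):
#   python pwb.py check_interwiki_links -page:"Easter eggs"
#   python pwb.py check_interwiki_links -cat:"Some category" -dbg
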
import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response):
        self.iw_prefix = iw_prefix # e.g. "wp"
        self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy"
        self.page_name = page_name # "Marathon (series)#Rampancy"
        self.page_name_only = page_name_only # "Marathon (series)"
        self.page_slug = page_slug # "Marathon_(series)#Rampancy"
        self.hosting_page = hosting_page # "Easter eggs"; page where the link was found
        self.curl_response = curl_response # a Response object from the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')

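# Sanity check: the two tuples above are consumed in parallel, so a length mismatch would silently pair prefixes with the wrong URLs
assert len(interwiki_prefixes) == len(interwiki_urls)
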
# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(the_link):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(the_link.hosting_page))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link; maxsplit=1 guards against a rare extra '#' in the anchor
    _, anchor_name = the_link.page_slug.split('#', 1)

    # Convert dot-notation hex entities to proper characters
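    # (MediaWiki's legacy anchor encoding turns a section named "Foo (bar)" into the id "Foo_.28bar.29", for example)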
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
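    # (MediaWiki can place an anchor id on a headline span, on the heading tag itself, or on a manually-added span or div)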
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if not found_section:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
        errors_issued = errors_issued + 1
    elif print_result:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))

# For a link that redirected us to another page, extract the name of the target page from the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract link from this markup which contains the name of the redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report an error and avoid slamming the output with massive page source from the failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            the_link.page_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                the_link.page_name_only, _ = the_link.page_slug.split('#', 1)
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_name_only, the_link.page_name))
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, the_link.page_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from
    # pages such as Special:PermanentLink.
    if the_link.curl_response.history:
        possibly_print(the_link)

        # If the linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued = errors_issued + 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    # However, the more usual way that a redirect occurs is that MediaWiki silently serves the target page's content under the link's URL, returning 200 OK as if the
    # link were direct; this happens when a redirect page is accessed. We must detect these soft redirects by looking at the page source to find the redirect note
    # inserted at the top of the page for the reader.
    elif 'Redirected from <a' in the_link.curl_response.text:
        unintended_redirects_found = unintended_redirects_found + 1
        possibly_print(the_link)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
        find_canonical_link(the_link) # calls find_section() at the end
    elif '#' in the_link.page_slug:
        find_section(the_link, False)

# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked = pages_checked + 1
    cur_prefix = 0
    name_printed = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
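        # e.g. "[[wp:Marathon (series)#Rampancy|the series]]" yields the match "[[wp:Marathon (series)#Rampancy|"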
        for match in re.finditer(iw_link, page_text):
            the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", "", "", "", page_name, "")

            # Extract just the page title from this regex match
            s = match.start() + 2 + len(the_link.iw_prefix) + 1
            e = match.end() - 1

            # Use underscores in the slug used to construct the URL, but retain spaces for the printable name
            the_link.page_slug = page_text[s:e].replace(' ', '_')
            the_link.page_name = page_text[s:e]
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, the_link.page_name))
            iw_found = iw_found + 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
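            # e.g. the slug "fr:Paris" turns https://en.wikipedia.org/wiki/fr:Paris into https://fr.wikipedia.org/wiki/Paris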
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language (compare case-insensitively to also catch "Wp:" and "wP:")
                if lang_code.lower() != "wp.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')

            # Test the URL
            test_interwiki_link(the_link)
        cur_prefix = cur_prefix + 1

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirects were"
    if unintended_redirects_found == 1:
        warning_str = "likely-unintended redirect was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    #pywikibot.stdout('The members of the requests.models.Response class are:')
    #pywikibot.stdout(format(dir(requests.models.Response)))
    #return

    # Check specified page or loop through specified category and check all pages
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()