source: ValBot/Python/check_interwiki_links.py@ 1200

Last change on this file since 1200 was 1200, checked in by iritscen, 3 days ago

Reverted last commit now that test is done.

File size: 13.0 KB
# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered interwiki prefix and loads the linked page, verifying that it exists and that
# any section link, if present, is valid as well. The output will use the word "ERROR" when it cannot validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----|
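# Example invocations (the page and category names below are placeholders; a configured Pywikibot
# installation for the target wiki is assumed):
#   python check_interwiki_links.py -page:"Some page"
#   python check_interwiki_links.py -cat:"Some category" -dbg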

import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from pywikibot.tools.formatter import color_format
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response):
        self.iw_prefix = iw_prefix           # e.g. "wp"
        self.prefix_url = prefix_url         # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url             # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy"
        self.page_name = page_name           # "Marathon (series)#Rampancy"
        self.page_name_only = page_name_only # "Marathon (series)"
        self.page_slug = page_slug           # "Marathon_(series)#Rampancy"
        self.hosting_page = hosting_page     # "Easter eggs"; page where the link was found
        self.curl_response = curl_response   # a Response object from the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(the_link):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(the_link.hosting_page))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link
    _, anchor_name = the_link.page_slug.split('#')

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if found_section == False:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
        errors_issued = errors_issued + 1
    elif print_result == True:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
# For a link that redirected us to another page, extract the name of the target page from the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract the name of the redirected-to page from markup like this:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    prefix_length = len(the_link.prefix_url)
    canonical_name = canonical_name[prefix_length:]
    tag_end = canonical_name.find('">')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report an error and avoid slamming the output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            the_link.page_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                the_link.page_name_only, _ = the_link.page_slug.split('#')
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_name_only, the_link.page_name))
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, the_link.page_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from
    # pages such as Special:PermanentLink.
    if the_link.curl_response.history != []:
        possibly_print(the_link)

        # If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued = errors_issued + 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    # However, the usual way that a redirect occurs is that MediaWiki silently serves the target page's content while returning 200 OK as if the link were correct; this
    # happens when a redirect page is accessed. We must detect these soft redirects by looking at the page source to find the redirect note inserted at the top of the
    # page for the reader.
    elif 'Redirected from <a' in the_link.curl_response.text:
        unintended_redirects_found = unintended_redirects_found + 1
        possibly_print(the_link)
        pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
        find_canonical_link(the_link) # calls find_section() at end
    elif '#' in the_link.page_slug:
        find_section(the_link, False)

# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global name_printed
    pages_checked = pages_checked + 1
    cur_prefix = 0
    name_printed = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
        for match in re.finditer(iw_link, page_text):
            the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", "", "", "", page_name, "")

            # Extract just the page title from this regex match
            s = match.start() + 2 + len(the_link.iw_prefix) + 1
            e = match.end() - 1

            # Use underscores in the slug used to construct the URL, but retain spaces for the printable name
            the_link.page_slug = page_text[s:e].replace(' ', '_')
            the_link.page_name = page_text[s:e]
            if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, the_link.page_name))
            iw_found = iw_found + 1

            # Construct full URL for the particular wiki
            the_link.full_url = the_link.prefix_url + the_link.page_slug

            # Adjust URL if this is a foreign-language WP link
            if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
                lang_code = the_link.page_slug[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    the_link.full_url = the_link.full_url.replace('en.', lang_code)
                    the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')

            # Test the URL
            test_interwiki_link(the_link)
        cur_prefix = cur_prefix + 1

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global errors_issued
    global unintended_redirects_found

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

    warning_str = "likely-unintended redirects were"
    if unintended_redirects_found == 1:
        warning_str = "likely-unintended redirect was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
            return

    #pywikibot.stdout('The members of the requests.models.Response class are:')
    #pywikibot.stdout(format(dir(requests.models.Response)))
    #return

    # Check specified page or loop through specified category and check all pages
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_interwiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_interwiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()