source: ValBot/check_intrawiki_section_links.py@ 1156

Last change on this file since 1156 was 1153, checked in by iritscen, 4 years ago

ValBot: Adding script for checking intrawiki section links. Updated read-me.

File size: 9.0 KB
RevLine 
[1153]1import os
2
3from urllib.parse import urljoin
4
5import pywikibot
6import re
7
8from pywikibot.bot import QuitKeyboardInterrupt
9from pywikibot import pagegenerators
10from pywikibot.tools.formatter import color_format
11from pywikibot.comms.http import fetch
12from pywikibot.specialbots import UploadRobot
13from bs4 import BeautifulSoup
14
# Namespace prefixes that exist on OniGalore itself; scan_for_iw_links() treats a link
# carrying one of these prefixes as a page on this wiki
intrawiki_prefixes = (
    'Image', 'Special',
    'Talk',
    'User', 'User_talk',
    'OniGalore', 'OniGalore_talk',
    'File', 'File_talk',
    'MediaWiki', 'MediaWiki_talk',
    'Template', 'Template_talk',
    'Help', 'Help_talk',
    'Category', 'Category_talk',
    'BSL', 'BSL_talk',
    'OBD', 'OBD_talk',
    'AE', 'AE_talk',
    'Oni2', 'Oni2_talk',
    'XML', 'XML_talk',
)

# Base URL of our wiki; a page title is appended directly to it to build a page URL
onigalore_url = 'https://wiki.oni2.net/'

# Interwiki prefixes; links carrying one of these are ruled out of the check
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google',
    'metawikimedia', 'mw',
    'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia',
    'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage',
    'wikt', 'wiktionary', 'wp',
)

# Running totals, updated by scan_for_iw_links() and reported by main()
pages_checked = 0
iw_found = 0
problems_found = 0

# Title of the page currently being scanned; set by main() before each scan
page_name = ''
28
def scan_for_iw_links(page_text):
    """Check every intrawiki link with a section anchor found in page_text.

    Each "[[...]]" link containing a '#' is resolved to a page on this wiki, that
    page is fetched, and its HTML is searched for an element whose id equals the
    section name. Links without a '#' are ignored because MediaWiki itself takes
    care of checking basic intrawiki links. Interwiki links are skipped.

    The title of the page being scanned is read from the global page_name; it is
    needed to resolve bare "#Section" links and relative "../" links.

    Side effects: increments the globals pages_checked (once per call), iw_found
    (once per intrawiki section link) and problems_found (once per redirect,
    non-200 response, unresolvable relative link or missing section), and prints
    findings through pywikibot.output().

    Args:
        page_text: wikitext of the page to scan.

    Raises:
        SystemExit: if a link has a prefix that is neither a known namespace of
            this wiki nor a known interwiki prefix.
    """
    global pages_checked
    global iw_found
    global problems_found
    pages_checked += 1

    # A prefix only counts when it is the text before the FIRST colon of the link;
    # searching for "prefix:" anywhere in the link would make an interwiki link such as
    # "wp:Category:Foo#Bar" look like a local Category page. MediaWiki does not care
    # about the letter case of prefixes, so compare in lower case.
    local_namespaces = {prefix.lower() for prefix in intrawiki_prefixes}
    foreign_wikis = {prefix.lower() for prefix in interwiki_prefixes}

    # Sometimes a space char. is used instead of a '_', so normalize our own title too;
    # it gets combined with link text that is normalized the same way below
    current_page = page_name.replace(' ', '_')

    # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    # "[[any:thi|ng]]"
    link_pattern = re.compile(r"\[\[[^|\]]*(\||\])")
    for match in link_pattern.finditer(page_text):
        # Cut out the matched text, dropping the "[[" from the front and the "|" or "]"
        # from the end, and use '_' wherever a space was typed
        link_text = page_text[match.start() + 2:match.end() - 1].replace(' ', '_')

        # No section anchor means there is nothing for us to verify
        if '#' not in link_text:
            continue

        # If there is a '{' in the link, then probably it's a link built on transcluded text
        # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
        if '{' in link_text:
            pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
            continue

        if link_text.startswith('../'):
            # Relative link: resolve it against the parent of the current page. Note that
            # this is only performed once, so if there's multiple steps back ("../../"),
            # we're out of luck.
            parent, slash, _ = current_page.rpartition('/')
            if not slash:
                # Without a '/' in our own title there is no parent page to go up to
                pywikibot.output('ERROR: Link {0} points to a parent page, but page {1} has no parent!'.format(link_text, current_page))
                iw_found += 1
                problems_found += 1
                continue
            remainder = link_text[3:]
            # A bare section link refers to the parent page itself, so no slash is added;
            # otherwise we are drilling down to another subpage of the parent
            joiner = '' if remainder.startswith('#') else '/'
            link_text = parent + joiner + remainder
        elif link_text.startswith(':'):
            # A leading colon (as in "[[:Category:Foo#Bar]]") is not part of the title
            link_text = link_text[1:]

        if link_text.startswith('#'):
            # Bare section link, so the target is this very page
            target_page = current_page
            link_text = current_page + link_text
        else:
            # Only a colon before the section anchor can delimit a prefix; a colon in
            # the section name is just part of the text
            target_page = link_text.partition('#')[0]
            if ':' in target_page:
                prefix = target_page.partition(':')[0].lower()
                if prefix in foreign_wikis:
                    # Interwiki links are not ours to check
                    continue
                if prefix not in local_namespaces:
                    # If we can't tell where this link goes, something's gone wrong
                    pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
                    raise SystemExit(1)
        iw_found += 1
        section_name = link_text.partition('#')[2]

        # Test the URL
        response = fetch(onigalore_url + target_page)

        # Redirects are followed automatically by fetch() and treated as "200"s, so the
        # way we tell that a redirect occurred is by checking the history
        if response.history:
            pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
            problems_found += 1
        elif response.status_code != 200:
            # No message here because fetch() already prints one for such responses
            problems_found += 1
        else:
            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor; any element can carry
            # the id, not only <span>
            soup = BeautifulSoup(response.text, 'html.parser')
            if soup.find(id=section_name) is None:
                pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, target_page))
                problems_found += 1
163
def main(*args):
    """Scan the requested page(s) for broken intrawiki section links and print a summary.

    Recognized arguments (after pywikibot.handle_args() has consumed the global ones):
        -cat:<name>    scan every page in the category, recursing into subcategories
        -page:<title>  scan a single page
    If both are given, the category takes precedence.

    Side effects: sets the global page_name to the title of each page before it is
    scanned, since scan_for_iw_links() reads it to resolve bare and relative links.

    Args:
        *args: command-line style arguments, passed on to pywikibot.handle_args().
    """
    global page_name

    cat_name = ''
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    site = pywikibot.Site()

    if cat_name:
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page {0}'.format(page.title()))
            page_name = page.title()
            scan_for_iw_links(page.text)
    elif page_name:
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        # Use the title as the Page object reports it rather than the raw argument, the
        # same as in category mode, so the scan builds its URLs from that title
        page_name = page.title()
        scan_for_iw_links(page.text)
    else:
        # Nothing was requested; say so instead of only reporting zero pages
        pywikibot.output('ADVICE: No pages to check. Use "-cat:<category>" or "-page:<title>".')

    pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))


if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.