sdl2man (8366B)
#!/usr/bin/env python3
"""sdl2man: downloads SDL2 documentation from https://wiki.libsdl.org/ and
converts it into man(3) pages under $XDG_DATA_HOME/sdl2man."""

import re
import time
import argparse
import os
import stat
from pathlib import Path
import requests
import subprocess  # kept from original file (not used in this revision)
import shutil      # kept from original file (not used in this revision)

epilog = """The generated man pages will be located at $XDG_DATA_HOME/sdl2man/man (usually .local/share/sdl2man/man). They can be installed by running the script in the sdl2man directory, or by adding the man directory to your MANPATH."""

parser = argparse.ArgumentParser(epilog=epilog)

# Snapshot of the defaults, used below to detect "no arguments given".
default_args = {'fresh': False, 'generate': False, 'pull': False, 'time_interval': 10}

download_group = parser.add_mutually_exclusive_group()
download_group.add_argument('-p', '--pull', action='store_true', help='downloads updated copies of changed pages')
download_group.add_argument('-f', '--fresh', action='store_true', help='removes local copies of pages and redownloads')
# BUGFIX: type=int so a user-supplied value is numeric; without it,
# time.sleep(args.time_interval) raised TypeError for e.g. "-t 5"
# (argparse hands string values through untouched).
parser.add_argument('-t', '--time-interval', action='store', type=int, default=10, help='sets time interval on page downloads in seconds (to avoid rate limiting), default is 10')
parser.add_argument('-g', '--generate', action='store_true', help='generates manpages from local copies')
args = parser.parse_args()

# With no options at all there is nothing to do; show the help text.
# (Execution deliberately continues — all the option-guarded blocks below
# are no-ops in that case.)
if vars(args) == default_args:
    parser.print_help()

# Data directory: $XDG_DATA_HOME/sdl2man, falling back to ~/.local/share/sdl2man.
try:
    LOCAL = Path(os.environ['XDG_DATA_HOME']) / 'sdl2man'
except KeyError:
    LOCAL = Path.home() / '.local' / 'share' / 'sdl2man'
LOCAL.mkdir(parents=True, exist_ok=True)


root = "https://wiki.libsdl.org/"
lastupdate_path = LOCAL / 'lastupdate'  # timestamp of the last complete pull

pages_path = LOCAL / 'pages'            # raw HTML copies of the wiki pages
pages_path.mkdir(parents=True, exist_ok=True)

man_path = LOCAL / 'man' / 'man3'       # generated troff output
man_path.mkdir(parents=True, exist_ok=True)

install_path = Path('/usr/local/man')

# Shell script written next to the man pages so the user can install them.
script_text = """#!/bin/sh

mkdir -p {0}/man3
cp -r man/* {0}
""".format(install_path.as_posix())


def retrieve_changed_pages():
    """Return the names of SDL_* wiki pages changed since the last pull.

    Reads the timestamp stored in `lastupdate_path` (if present) and scans
    the wiki's RecentChanges feed for entries newer than it.  When no valid
    timestamp exists, every matching page is treated as changed.

    Raises Exception if the stored timestamp cannot be parsed.
    """
    lastupdate = None
    if lastupdate_path.exists():
        # BUGFIX: the file is written by download_pages() WITHOUT a trailing
        # newline, so the old `read_text()[:-1]` chopped the final digit off
        # the timestamp and made strptime fail; strip() handles both cases.
        content = lastupdate_path.read_text().strip()
        if content:
            try:
                lastupdate = time.strptime(content, '%Y%m%d%H%M%S')
            except ValueError as e:
                raise Exception('lastupdate badly formatted') from e
    else:
        # No timestamp: if the pages directory is also empty, this is
        # probably a first run and -p is the wrong mode.
        try:
            next(pages_path.iterdir())
        except StopIteration:
            print("It appears as if the program hasn't been run before, run with -f instead to download a copy of the documentation.")

    changed_pages = []
    changes_page = requests.get(root + 'RecentChanges').text
    for item in re.finditer(r'<rdf:li[^>]*?(SDL_[^\#]*)\#([0-9]*).*?></rdf:li>', changes_page, re.DOTALL):
        page = item.group(1)
        update_time = time.strptime(item.group(2), '%Y%m%d%H%M%S')

        # BUGFIX: the original compared against the misspelled name
        # `updatetime` (NameError) and crashed with TypeError whenever
        # lastupdate was still None.
        if lastupdate is None or lastupdate < update_time:
            changed_pages.append(page)

    return changed_pages


def download_pages(page_list):
    """Download every page in `page_list` into pages_path as <name>.html.

    Sleeps args.time_interval seconds between requests to stay under the
    wiki's surge protection.  The completion timestamp is recorded only when
    the whole list was fetched (the for/else runs only if the loop was not
    broken by rate limiting).
    """
    for i, name in enumerate(page_list):
        page = requests.get(root + name)
        if '<h1>Surge protection</h1>' in page.text:
            print('Rate limited')
            break
        print("{}/{}".format(i + 1, len(page_list)))

        # write_text creates the file if it does not exist yet.
        (pages_path / (name + '.html')).write_text(page.text)

        time.sleep(args.time_interval)
    else:
        lastupdate_path.write_text(time.strftime('%Y%m%d%H%M%S'))


def retrieve_page_list():
    """Return the names of all API pages linked from the wiki's CategoryAPI page."""
    api_page = requests.get(root + 'CategoryAPI')
    pages = []

    # isolate the sections containing links, then collect each linked name
    for section in re.finditer(r'searchresults">(.*)</div>', api_page.text, re.DOTALL):
        for item in re.finditer(r'href="/([_a-zA-Z]*)\?', section.group(0)):
            pages.append(item.group(1))

    return pages


def retrieve_local_page_list():
    """Return the page names (file stems) already downloaded into pages_path."""
    return [item.stem for item in pages_path.iterdir()]


def generate(pages):
    """Convert each downloaded page in `pages` into a man page and write an
    executable install script into the data directory."""
    for i, page in enumerate(pages):
        path = pages_path / (page + '.html')
        text = path.read_text()

        man_text = html2man(page, text).encode('utf-8')
        (man_path / (page + '.3')).write_bytes(man_text)

        print('{}/{}'.format(i + 1, len(pages)))

    install_script = LOCAL / 'install_pages.sh'
    install_script.write_text(script_text)
    # mark the install script user-executable
    install_script.chmod(install_script.stat().st_mode | stat.S_IXUSR)


def html2man(name, text):
    """Convert HTML from the SDL wiki page `name` into troff man-page source."""
    # trim the large chunks of fat surrounding the article body
    text = re.sub(r".*(?=<h1.*" + name + r"</h1>)", '', text, count=1, flags=re.DOTALL)
    text = re.sub(r"<script.*?</script>", '', text, flags=re.DOTALL)
    text = re.sub(r"<hr.*", '', text, flags=re.DOTALL)

    # trim out smaller, pervasive chunks of fat
    text = re.sub(r'<span class="anchor".*?</span>', '', text, flags=re.DOTALL)
    text = re.sub(r'<p class=.*?>', '', text, flags=re.DOTALL)

    # trim out very specific chunks of fat
    text = re.sub(r'<div class="table-of-contents.*?</div>', '', text, flags=re.DOTALL)
    text = re.sub(r'<span class="n">(.*?)</span>', r'\1', text, flags=re.DOTALL)

    # replace the list of related pages at the bottom with comma-separated text
    text = re.sub(r'<li style="list-style-type:none">(.*?) ?</li>', '\\1, ', text, flags=re.DOTALL)
    text = re.sub(r'<ul>(.*?)</ul>', '\\1\n', text, flags=re.DOTALL)

    # replace html headers with troff headers
    date = time.strftime('%Y-%m-%d')
    text = re.sub(r'<h1 id=.*?>(.*?)</h1>', r'.TH \1 3 ' + date + ' "sdl2man" "SDLWiki"', text, flags=re.DOTALL)
    text = re.sub(r'<h2 id=.*?>(.*?)</h2>', r'.SH \1', text, flags=re.DOTALL)

    # tables -> tbl(1) .TS/.TE blocks
    text = re.sub(r'<div><table><tbody>(.*?)</tbody></table></div>', '\n.TS\nallbox;\n\\1\n.TE\n', text, flags=re.DOTALL)

    tables = re.findall(r'\.TS.*?\.TE', text, re.DOTALL)

    # column count per table = total cells / rows (tag counts include both
    # the opening and closing tag, hence the division by two)
    row_lengths = []
    for table in tables:
        rows = len(re.findall('</?tr>', table)) // 2
        entries = len(re.findall('</?td>', table)) // 2
        row_lengths.append(entries // rows)

    text = re.sub(r'<tr>(.*?)</tr>', '\\1', text, flags=re.DOTALL)
    text = re.sub('<td>(.*?)</td>\n', 'T{\n\\1\nT}\t', text, flags=re.DOTALL)
    text = re.sub(' *T{', 'T{', text)
    text = re.sub('T}\t\n', 'T}\n', text)

    # BUGFIX: count=1 so each table gets ITS OWN column-format line; without
    # it the first iteration rewrote every table header, giving all tables
    # the first table's column count.
    for row_length in row_lengths:
        text = re.sub('allbox;\nT{', 'allbox;\n' + 'l' * row_length + '.\nT{', text, count=1)

    # handle code areas: unwrap the <pre> block, then strip the per-line and
    # syntax-highlighting spans
    text = re.sub(r'<div class="codearea".*?>.*?<pre.*?>(.*?)</pre></div>', '\\1', text, flags=re.DOTALL)
    text = re.sub(r'<span class="line">(.*?)</span>', '\\1\n', text, flags=re.DOTALL)
    text = re.sub(r'<span class="[a-z]*?">(.*?)</span>', '\\1', text)
    text = re.sub(r'(?:<span class="[a-z]*?">)(.*?)\n(.*?)(?:</span>)', '\\1\\2', text, flags=re.DOTALL)

    # decode HTML character entities.
    # NOTE(review): reconstructed — the available copy of this section was
    # itself entity-decoded by a web renderer (it literally read
    # re.sub('<', '>', text)); these four subs restore the evident intent.
    text = re.sub('&quot;', '"', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)

    # unwrap links and embolden <strong> text
    text = re.sub(r'<a href=.*?>(.*?)</a>', '\\1', text, flags=re.DOTALL)
    text = re.sub(r'<strong>(.*?)</strong>', '\n.B \\1\n', text)

    # spacing
    text = re.sub('\n\\.TE', '.TE\n', text)
    text = re.sub(r'\n\(\)([ .]?)', '()\\1\n', text)

    # get rid of any leftover comma at the end of the Related list
    text = re.sub(r'(.SH Related.*?), \n', '\\1\n', text, flags=re.DOTALL)
    # clean up remaining matched html tags
    text = re.sub(r'<(.*?).*?>(.*?)</\1>', r'\2', text, flags=re.DOTALL)

    return text


pages = None
if args.pull:
    pages = retrieve_changed_pages() or []
    all_pages = retrieve_page_list()
    local_pages = retrieve_local_page_list()

    # retrieve both pages that aren't on the computer and those that changed
    to_retrieve = [page for page in all_pages if page not in local_pages] + pages

    download_pages(to_retrieve)

if args.fresh:
    pages = retrieve_page_list()
    download_pages(pages)

if args.generate:
    if pages is None:
        pages = retrieve_local_page_list()
    generate(pages)