sdl2man

converts SDL Wiki pages to manpages
git clone git://nihaljere.xyz/sdl2man
Log | Files | Refs | README

sdl2man (8366B)


      1 #!/usr/bin/env python3
      2 
      3 import re
      4 import time
      5 import argparse
      6 import os
      7 import stat
      8 from pathlib import Path
      9 import requests
     10 import subprocess
     11 import shutil
     12 
     13 epilog = """The generated man pages will be located at $XDG_DATA_HOME/sdl2man/man (usually .local/share/sdl2man/man). They can be installed by running the script in the sdl2man directory, or by adding the man directory to your MANPATH."""
     14 
     15 parser = argparse.ArgumentParser(epilog=epilog)
     16 
     17 default_args = {'fresh' : False, 'generate' : False, 'pull' : False, 'time_interval' : 10}
     18 
     19 download_group = parser.add_mutually_exclusive_group()
     20 download_group.add_argument('-p', '--pull', action='store_true', help='downloads updated copies of changed pages')
     21 download_group.add_argument('-f', '--fresh', action='store_true', help='removes local copies of pages and redownloads')
     22 parser.add_argument('-t', '--time-interval', action='store', default=10, help='sets time interval on page downloads in seconds (to avoid rate limiting), default is 10')
     23 parser.add_argument('-g', '--generate', action='store_true', help='generates manpages from local copies')
     24 args = parser.parse_args()
     25 
     26 if vars(args) == default_args:
     27     parser.print_help()
     28 
     29 LOCAL = None
     30 try:
     31     LOCAL = Path(os.environ['XDG_DATA_HOME']) / 'sdl2man'
     32     LOCAL.mkdir(parents=True, exist_ok=True)
     33 except KeyError:
     34     LOCAL = Path.home() / '.local' / 'share' / 'sdl2man'
     35     LOCAL.mkdir(parents=True, exist_ok=True)
     36 
     37 
     38 root = "https://wiki.libsdl.org/"
     39 lastupdate_path = LOCAL / 'lastupdate'
     40 
     41 pages_path = LOCAL / 'pages'
     42 pages_path.mkdir(parents=True, exist_ok=True)
     43 
     44 man_path = LOCAL / 'man' / 'man3'
     45 man_path.mkdir(parents=True, exist_ok=True)
     46 
     47 install_path = Path('/usr/local/man')
     48 
     49 script_text = """#!/bin/sh
     50 
     51 mkdir -p {0}/man3
     52 cp -r man/* {0}
     53 """.format(install_path.as_posix())
     54 
     55 def retrieve_changed_pages():
     56     #current time as formatted in the rss
     57     lastupdate = None
     58     if lastupdate_path.exists():
     59         # gets rid of newline
     60         content = lastupdate_path.read_text()[:-1]
     61         if content:
     62             try:
     63                 lastupdate = time.strptime(content, '%Y%m%d%H%M%S')
     64             except ValueError as e:
     65                 raise Exception('lastupdate badly formatted') from e
     66     else:
     67         #checks if the pages directory is empty
     68         try:
     69             pages_path.iterdir().__next__()
     70         except StopIteration:
     71             print("It appears as if the program hasn't been run before, run with -f instead to download a copy of the documentation.")
     72 
     73     changed_pages = []
     74     changes_page = requests.get(root + 'RecentChanges').text
     75     for item in re.finditer('<rdf:li[^>]*?(SDL_[^\#]*)\#([0-9]*).*?></rdf:li>', changes_page, re.DOTALL):
     76         page = item.group(1)
     77         update_time = time.strptime(item.group(2), '%Y%m%d%H%M%S')
     78 
     79         if lastupdate < updatetime:
     80             changed_pages.append(page)
     81 
     82     return changed_pages
     83 
     84 def download_pages(page_list):
     85     for i, name in enumerate(page_list):
     86         page = requests.get(root + name)
     87         if '<h1>Surge protection</h1>' in page.text:
     88             print('Rate limited')
     89             break
     90         print("{}/{}".format(i+1, len(page_list)))
     91         page_path = pages_path / (name + '.html')
     92         if not page_path.exists():
     93             page_path.touch()
     94 
     95         page_path.write_text(page.text)
     96 
     97         time.sleep(args.time_interval)
     98     else:
     99         lastupdate_path.write_text(time.strftime('%Y%m%d%H%M%S'))
    100 
    101 def retrieve_page_list():
    102     api_page = requests.get(root + 'CategoryAPI')
    103     pages = []
    104 
    105     # isolates the different sections containing links
    106     for section in re.finditer('searchresults">(.*)</div>', api_page.text, re.DOTALL):
    107         # create a list of all of the names linked to
    108         for item in re.finditer('href="/([_a-zA-Z]*)\?', section.group(0)):
    109             pages.append(item.group(1))
    110     
    111     return pages
    112 
    113 def retrieve_local_page_list():
    114     pages = []
    115     for item in pages_path.iterdir():
    116         pages.append(item.stem)
    117 
    118     return pages
    119 
    120 # generates all man pages and install script
    121 def generate(pages):
    122     for i, page in enumerate(pages):
    123         path = pages_path / (page + '.html')
    124         text = path.read_text()
    125 
    126         man_text = html2man(page, text).encode('utf-8')
    127 
    128         (man_path / (page + '.3')).write_bytes(man_text)
    129 
    130         print('{}/{}'.format(i+1, len(pages)))
    131 
    132     install_script = LOCAL / 'install_pages.sh'
    133     install_script.write_text(script_text)
    134     mode_flags = os.stat(install_script.as_posix()).st_mode | stat.S_IXUSR
    135     os.chmod(install_script.as_posix(), mode_flags)
    136 
    137 # converts html from the SDL wiki into a man page
    138 def html2man(name, text):
    139     # trims the large chunks of fat
    140     text = re.sub(".*(?=<h1.*" + name + "</h1>)", '', text, count=1, flags=re.DOTALL)
    141     text = re.sub("<script.*?</script>", '', text, flags=re.DOTALL)
    142     text = re.sub("<hr.*", '', text, flags=re.DOTALL)
    143 
    144     # trims out smaller, pervasive chunks of fat
    145     text = re.sub("<span class=\"anchor\".*?</span>", '', text, flags=re.DOTALL)
    146     text = re.sub("<p class=.*?>", '', text, flags=re.DOTALL)
    147 
    148     # trim out very specific chunks of fat
    149     text = re.sub("<div class=\"table-of-contents.*?</div>", '', text, flags=re.DOTALL)
    150     text = re.sub("<span class=\"n\">(.*?)</span>", r'\1', text, flags=re.DOTALL)
    151     
    152     # replaces list of related structures at bottom of page with troff lists
    153     text = re.sub("<li style=\"list-style-type:none\">(.*?) ?</li>", '\\1, ', text, flags=re.DOTALL)
    154     text = re.sub("<ul>(.*?)</ul>", '\\1\n', text, flags=re.DOTALL)
    155 
    156     # replaces html headers with troff headers
    157     date = time.strftime('%Y-%m-%d')
    158     text = re.sub("<h1 id=.*?>(.*?)</h1>", r'.TH \1 3 ' + date + ' "sdl2man" "SDLWiki"', text, flags=re.DOTALL)
    159     text = re.sub("<h2 id=.*?>(.*?)</h2>", r'.SH \1', text, flags=re.DOTALL)
    160 
    161     # handle tables
    162     text = re.sub('<div><table><tbody>(.*?)</tbody></table></div>', '\n.TS\nallbox;\n\\1\n.TE\n', text, flags=re.DOTALL)
    163 
    164     tables = re.findall('\.TS.*?\.TE', text, re.DOTALL)
    165 
    166     row_lengths = []
    167     for table in tables:
    168         rows = int(len(re.findall('</?tr>', table))/2)
    169         entries = int(len(re.findall('</?td>', table))/2)
    170         row_lengths.append(int(entries / rows))
    171 
    172     text = re.sub('<tr>(.*?)</tr>', '\\1', text, flags=re.DOTALL)
    173     text = re.sub('<td>(.*?)</td>\n', 'T{\n\\1\nT}\t', text, flags=re.DOTALL)
    174     text = re.sub(' *T{', 'T{', text)
    175     text = re.sub('T}\t\n', 'T}\n', text)
    176 
    177     for row_length in row_lengths:
    178         text = re.sub('allbox;\nT{', 'allbox;\n' + 'l'*row_length + '.\nT{', text)
    179 
    180     # handles code areas
    181     text = re.sub('<div class="codearea".*?>.*?<pre.*?>(.*?)</pre></div>', '\\1', text, flags=re.DOTALL)
    182     text = re.sub('<span class="line">(.*?)</span>', '\\1\n', text, flags=re.DOTALL)
    183     text = re.sub('<span class="[a-z]*?">(.*?)</span>', '\\1', text)
    184     text = re.sub('(?:<span class="[a-z]*?">)(.*?)\n(.*?)(?:</span>)', '\\1\\2', text, flags=re.DOTALL)
    185     #gets rid of newlines which just separate single lines
    186 
    187     # html formatting
    188     text = re.sub('&quot;', '"', text)
    189     text = re.sub('&amp;', '&', text)
    190     text = re.sub('&lt;', '>', text)
    191     text = re.sub('&gt;', '>', text)
    192     #text = re.sub("<a href=\"/SDL_.*?\">(SDL_.*?)</a>", '\n.BR \\1\n', text, flags=re.DOTALL)
    193     text = re.sub("<a href=.*?>(.*?)</a>", '\\1', text, flags=re.DOTALL)
    194     text = re.sub("<strong>(.*?)</strong>", '\n.B \\1\n', text)
    195 
    196     #spacing
    197     #text = re.sub('\n[\n]+', '\n', text)
    198     text = re.sub('\n\.TE', '.TE\n', text)
    199     text = re.sub('\n\(\)([ .]?)', '()\\1\n', text)
    200 
    201 
    202     # gets rid of any left over commas at the end of related objects
    203     text = re.sub('(.SH Related.*?), \n', '\\1\n', text, flags=re.DOTALL)
    204     #cleans up remaining html tags
    205     text = re.sub('<(.*?).*?>(.*?)</\\1>', '\\2', text, flags=re.DOTALL)
    206 
    207     return text
    208 
    209 pages = None
    210 if args.pull:
    211     pages = retrieve_changed_pages() or []
    212     all_pages = retrieve_page_list()
    213     local_pages = retrieve_local_page_list()
    214 
    215     # retrieves both pages that aren't on the computer and those that have changed
    216     to_retrieve = [page for page in all_pages if page not in local_pages] + pages
    217     
    218     download_pages(to_retrieve)
    219 
    220 if args.fresh:
    221     pages = retrieve_page_list()
    222     download_pages(pages)
    223 
    224 if args.generate:
    225     if pages == None:
    226         pages = retrieve_local_page_list()
    227     generate(pages)