#!/usr/bin/python
# -*- coding: utf-8 -*-

"""Deroff.py, ported to Python from the venerable deroff.c.

Strips roff/man formatting (requests, macros, escapes, tbl/eqn/pic blocks)
from a document and returns the remaining plain text.

The code is written to run unchanged on Python 2.7 and Python 3.  The
character tables below use Latin-1 code points.
"""

import gzip
import re
import string
import sys


class Deroffer:
    """Stateful line-by-line roff stripper.

    Typical use::

        d = Deroffer()
        d.deroff(text)
        plain = d.get_output()

    ``self.s`` always holds the not-yet-consumed remainder of the current
    input line; the parsing methods consume from its front and return True
    when they recognised (and consumed) something.
    """

    g_specs_specletter = {
        # Output composed latin1 letters
        '-D': '\320', 'Sd': '\360', 'Tp': '\376', 'TP': '\336',
        'AE': '\306', 'ae': '\346', 'OE': "OE", 'oe': "oe",
        ':a': '\344', ':A': '\304', ':e': '\353', ':E': '\313',
        ':i': '\357', ':I': '\317', ':o': '\366', ':O': '\326',
        ':u': '\374', ':U': '\334', ':y': '\377', 'ss': '\337',
        '\'A': '\301', '\'E': '\311', '\'I': '\315', '\'O': '\323',
        '\'U': '\332', '\'Y': '\335', '\'a': '\341', '\'e': '\351',
        '\'i': '\355', '\'o': '\363', '\'u': '\372', '\'y': '\375',
        '^A': '\302', '^E': '\312', '^I': '\316', '^O': '\324',
        '^U': '\333', '^a': '\342', '^e': '\352', '^i': '\356',
        '^o': '\364', '^u': '\373',
        '`A': '\300', '`E': '\310', '`I': '\314', '`O': '\322',
        '`U': '\331', '`a': '\340', '`e': '\350', '`i': '\354',
        '`o': '\362', '`u': '\371',
        '~A': '\303', '~N': '\321', '~O': '\325', '~a': '\343',
        '~n': '\361', '~o': '\365',
        ',C': '\307', ',c': '\347',
        '/l': "/l", '/L': "/L", '/o': '\370', '/O': '\330',
        'oA': '\305', 'oa': '\345',

        # Ligatures
        'fi': 'fi', 'ff': 'ff', 'fl': 'fl',
        'Fi': 'ffi', 'Ff': 'fff', 'Fl': 'ffl'
    }

    g_specs = {
        'mi': '-', 'en': '-', 'hy': '-', 'em': "--",
        'lq': "\"",  # PCA: This used to be left and right smart quotes, but they look dumb
        'rq': "\"",  # So just use ordinary double quotes
        'Bq': ",,", 'oq': '`', 'cq': '\'', 'aq': '\'', 'dq': '"',
        'or': '|', 'at': '@', 'sh': '#',
        'Eu': '\244', 'eu': '\244', 'Do': '$', 'ct': '\242',
        'Fo': '\253', 'Fc': '\273', 'fo': '<', 'fc': '>',
        'r!': '\241', 'r?': '\277', 'Of': '\252', 'Om': '\272',
        'pc': '\267', 'S1': '\271', 'S2': '\262', 'S3': '\263',
        '<-': "<-", '->': "->", '<>': "<->", 'ua': '^', 'da': 'v',
        'lA': "<=", 'rA': "=>", 'hA': "<=>", 'uA': "^^", 'dA': "vv",
        'ba': '|', 'bb': '|', 'br': '|', 'bv': '|',
        'ru': '_', 'ul': '_', 'ci': 'O', 'bu': 'o',
        'co': '\251', 'rg': '\256', 'tm': "(TM)",
        'dd': "||", 'dg': '|', 'ps': '\266',
        'sc': '\247', 'de': '\260', '%0': "0/00",
        '14': '\274', '12': '\275', '34': '\276',
        'f/': '/', 'sl': '/', 'rs': '\\', 'sq': "[]", 'fm': '\'',
        'ha': '^', 'ti': '~',
        'lB': '[', 'rB': ']', 'lC': '{', 'rC': '}',
        'la': '<', 'ra': '>', 'lh': "<=", 'rh': "=>",
        'tf': "therefore", '~~': "~~", '~=': "~=", '!=': "!=",
        '**': '*', '+-': '\261', '<=': "<=", '==': "==", '=~': "=~",
        '>=': ">=", 'AN': "\\/", 'OR': "/\\", 'no': '\254',
        'te': "there exists", 'fa': "for all", 'Ah': "aleph",
        'Im': "imaginary", 'Re': "real", 'if': "infinity",
        'md': "\267", 'mo': "member of", 'mu': '\327',
        'nm': "not member of", 'pl': '+', 'eq': '=', 'pt': "oc",
        'pp': "perpendicular", 'sb': "(=", 'sp': "=)",
        'ib': "(-", 'ip': "-)", 'ap': '~', 'is': 'I', 'sr': "root",
        'pd': 'd', 'c*': "(x)", 'c+': "(+)", 'ca': "cap", 'cu': 'U',
        'di': '\367', 'gr': 'V', 'es': "{}", 'CR': "_|",
        'st': "such that", '/_': "/_", 'lz': "<>", 'an': '-',

        # Output Greek
        '*A': "Alpha", '*B': "Beta", '*C': "Xi", '*D': "Delta",
        '*E': "Epsilon", '*F': "Phi", '*G': "Gamma", '*H': "Theta",
        '*I': "Iota", '*K': "Kappa", '*L': "Lambda", '*M': "Mu",
        '*N': "Nu", '*O': "Omicron", '*P': "Pi", '*Q': "Psi",
        '*R': "Rho", '*S': "Sigma", '*T': "Tau", '*U': "Upsilon",
        '*W': "Omega", '*X': "Chi", '*Y': "Eta", '*Z': "Zeta",
        '*a': "alpha", '*b': "beta", '*c': "xi", '*d': "delta",
        '*e': "epsilon", '*f': "phi", '+f': "phi", '*g': "gamma",
        '*h': "theta", '+h': "theta", '*i': "iota", '*k': "kappa",
        '*l': "lambda", '*m': "\265", '*n': "nu", '*o': "omicron",
        '*p': "pi", '+p': "omega", '*q': "psi", '*r': "rho",
        '*s': "sigma", '*t': "tau", '*u': "upsilon", '*w': "omega",
        '*x': "chi", '*y': "eta", '*z': "zeta", 'ts': "sigma",
    }

    # Section titles that mark a "synopsis" header for macro_sh().
    # '\xdc' is Latin-1 U-umlaut: the German title is "UEBERSICHT" spelled
    # with an umlaut.  (The literal had been mangled by a wrong charset.)
    g_synopsis_headers = (' SYNOPSIS', ' "SYNOPSIS',
                          ' \xdcBERSICHT', ' "\xdcBERSICHT')

    g_re_word = re.compile(r'[a-zA-Z_]+')  # equivalent to the word() method
    g_re_number = re.compile(r'[+-]?\d+')  # equivalent to the number() method
    g_re_esc_char = re.compile(r"""([a-zA-Z_]) |    # Word
                                   ([+-]?\d)   |    # Number
                                   \\               # Backslash (for escape seq)
                                """, re.VERBOSE)

    # Match a sequence of not backslash or whitespace
    g_re_not_backslash_or_whitespace = re.compile(r'[^ \t\n\r\f\v\\]+')

    g_re_newline_collapse = re.compile(r'\n{3,}')

    g_re_font = re.compile(r"""\\f(         # Starts with backslash f
                               (\(\S{2})  | # Open paren, then two printable chars
                               (\[\S*?\]) | # Open bracket, zero or more printable characters, then close bracket
                               \S)          # Any printable character
                            """, re.VERBOSE)

    # This gets filled in in __init__ below
    g_macro_dict = False

    def __init__(self):
        """Initialise parser state and (once) the shared macro dispatch table."""
        self.reg_table = {}      # string registers defined by .ds
        self.tr_from = ''        # accumulated .tr source characters
        self.tr_to = ''          # accumulated .tr replacement characters
        self.tr = ''             # translation table built from the two above
        self.nls = 2
        self.specletter = False  # last spec() produced a letter-like glyph
        self.refer = False       # inside .[ ... .] (output suppressed)
        self.refer2 = False      # inside .R1 ... .R2 (recorded only)
        self.macro = 0           # truthy while output must be suppressed
        self.nobody = False
        self.inlist = False
        self.inheader = False
        self.pic = False         # inside .PS ... .PE
        self.tbl = False         # inside .TS ... .TE
        self.tblstate = 0
        self.tblTab = ''         # tbl column separator from tab(x)
        self.eqn = False         # inside .EQ ... .EN
        self.skipheaders = False
        self.skiplists = False
        self.ignore_sonx = False
        self.output = []         # list of emitted string fragments

        # tbl parser states
        self.OPTIONS = 0
        self.FORMAT = 1
        self.DATA = 2

        # Built lazily because the handler methods are defined further down
        # in the class body.  Keys are the first two characters following
        # the control character, hence entries such as 'B ' and 'P\n'.
        if not Deroffer.g_macro_dict:
            Deroffer.g_macro_dict = {
                'SH': Deroffer.macro_sh,
                'SS': Deroffer.macro_ss_ip,
                'IP': Deroffer.macro_ss_ip,
                'H ': Deroffer.macro_ss_ip,
                'I ': Deroffer.macro_i_ir,
                'IR': Deroffer.macro_i_ir,
                'IB': Deroffer.macro_i_ir,
                'B ': Deroffer.macro_i_ir,
                'BR': Deroffer.macro_i_ir,
                'BI': Deroffer.macro_i_ir,
                'R ': Deroffer.macro_i_ir,
                'RB': Deroffer.macro_i_ir,
                'RI': Deroffer.macro_i_ir,
                'AB': Deroffer.macro_i_ir,
                '] ': Deroffer.macro_close_bracket,
                'PS': Deroffer.macro_ps,
                'PE': Deroffer.macro_pe,
                'TS': Deroffer.macro_ts,
                'T&': Deroffer.macro_t_and,
                'TE': Deroffer.macro_te,
                'EQ': Deroffer.macro_eq,
                'EN': Deroffer.macro_en,
                'R1': Deroffer.macro_r1,
                'R2': Deroffer.macro_r2,
                'de': Deroffer.macro_de,
                'BL': Deroffer.macro_bl_vl,
                'VL': Deroffer.macro_bl_vl,
                'AL': Deroffer.macro_bl_vl,
                'LB': Deroffer.macro_bl_vl,
                'RL': Deroffer.macro_bl_vl,
                'ML': Deroffer.macro_bl_vl,
                'DL': Deroffer.macro_bl_vl,
                'BV': Deroffer.macro_bv,
                'LE': Deroffer.macro_le,
                'LP': Deroffer.macro_lp_pp,
                'PP': Deroffer.macro_lp_pp,
                'P\n': Deroffer.macro_lp_pp,
                'ds': Deroffer.macro_ds,
                'so': Deroffer.macro_so_nx,
                'nx': Deroffer.macro_so_nx,
                'tr': Deroffer.macro_tr,
                'sp': Deroffer.macro_sp,
            }

    # ------------------------------------------------------------------
    # Output handling
    # ------------------------------------------------------------------

    def flush_output(self, where):
        """Write the collected output to *where* (if truthy) and clear it."""
        if where:
            where.write(self.get_output())
        self.output[:] = []

    def get_output(self):
        """Return the collected output; runs of 3+ newlines become one."""
        res = ''.join(self.output)
        clean_res = Deroffer.g_re_newline_collapse.sub('\n', res)
        return clean_res

    def putchar(self, c):
        """Unconditionally append *c* to the output and return it."""
        self.output.append(c)
        return c

    # This gets swapped in in place of condputs the first time tr gets modified
    def condputs_tr(self, str):
        """Like condputs(), but applies the .tr translation table first."""
        special = (self.pic or self.eqn or self.refer or self.macro
                   or (self.skiplists and self.inlist)
                   or (self.skipheaders and self.inheader))
        if not special:
            self.output.append(str.translate(self.tr))

    def condputs(self, str):
        """Append *str* to the output unless output is currently suppressed."""
        special = (self.pic or self.eqn or self.refer or self.macro
                   or (self.skiplists and self.inlist)
                   or (self.skipheaders and self.inheader))
        if not special:
            self.output.append(str)

    # ------------------------------------------------------------------
    # Low-level helpers operating on self.s
    # ------------------------------------------------------------------

    def str_at(self, idx):
        """Return the character at *idx*, or '' when past the end."""
        return self.s[idx:idx+1]

    def skip_char(self, amt=1):
        """Drop the first *amt* characters of the pending input."""
        self.s = self.s[amt:]

    def skip_leading_whitespace(self):
        """Drop leading whitespace (including newlines) from the input."""
        self.s = self.s.lstrip()

    def is_white(self, idx):
        """True when the character at *idx* is whitespace."""
        # Note this returns false for empty strings (idx >= len(self.s))
        return self.s[idx:idx+1].isspace()

    def str_eq(self, offset, other, len):
        """True when *len* chars of the input at *offset* equal other[:len]."""
        # The original definition lacked ``self`` and could never be called.
        return self.s[offset:offset+len] == other[:len]

    def prch(self, idx):
        """True when the character at *idx* exists and is not space/tab/newline."""
        # Note that this return False for the empty string (idx >= len(self.s))
        ch = self.s[idx:idx+1]
        return ch not in ' \t\n'

    def letter(self, idx):
        """True when the character at *idx* is alphabetic or an underscore."""
        ch = self.str_at(idx)
        return ch.isalpha() or ch == '_'  # underscore is used in C identifiers

    def digit(self, idx):
        """True when the character at *idx* is a digit."""
        ch = self.str_at(idx)
        return ch.isdigit()

    # ------------------------------------------------------------------
    # Escape sequence recognisers
    # ------------------------------------------------------------------

    def font(self):
        """Consume a font escape (\\fX, \\f(XX, \\f[name]); return success."""
        match = Deroffer.g_re_font.match(self.s)
        if not match:
            return False
        self.skip_char(match.end())
        return True

    def font2(self):
        """Hand-written alternative to font(); return True when consumed."""
        if self.s[0:2] == '\\f':
            c = self.str_at(2)
            if c == '(' and self.prch(3) and self.prch(4):
                self.skip_char(5)
                return True
            elif c == '[':
                self.skip_char(2)
                while self.prch(0) and self.str_at(0) != ']':
                    self.skip_char()
                if self.str_at(0) == ']':
                    self.skip_char()
                # Input was consumed, so report success (the original fell
                # through to ``return False`` here).
                return True
            elif self.prch(2):
                self.skip_char(3)
                return True
        return False

    def comment(self):
        """Discard input up to (not including) the newline; always True."""
        # Here we require that the string start with \"
        idx = self.s.find('\n')
        self.s = self.s[idx:] if idx != -1 else ''
        return True

    def numreq(self):
        """Consume a quoted numeric request such as \\h'...' or \\w'...'."""
        # We require that the string starts with backslash
        if self.str_at(1) in 'hvwud' and self.str_at(2) == '\'':
            # Bump the suppression counter so the argument is not emitted.
            self.macro += 1
            self.skip_char(3)
            while self.str_at(0) != '\'' and self.esc_char():
                pass  # Weird
            if self.str_at(0) == '\'':
                self.skip_char()
            self.macro -= 1
            return True
        return False

    def var(self):
        """Consume a register (\\n...) or string (\\*...) reference.

        Number registers are dropped.  A string reference whose name was
        defined with .ds is replaced by the expansion of its value.  Returns
        True when the reference was fully handled.
        """
        reg = ''
        s0s1 = self.s[0:2]
        if s0s1 == '\\n':
            # \n(xx also covers the \n(dy case; the old separate "dy" test
            # ignored s[2] and could swallow literal text.
            if self.str_at(2) == '(' and self.prch(3) and self.prch(4):
                self.skip_char(5)
                return True
            elif self.str_at(2) == '[' and self.prch(3):
                self.skip_char(3)
                while self.str_at(0) and self.str_at(0) != ']':
                    self.skip_char()
                if self.str_at(0) == ']':
                    self.skip_char()  # do not leak the closing bracket
                return True
            elif self.prch(2):
                self.skip_char(3)
                return True
        elif s0s1 == '\\*':
            if self.str_at(2) == '(' and self.prch(3) and self.prch(4):
                reg = self.s[3:5]
                self.skip_char(5)
            elif self.str_at(2) == '[' and self.prch(3):
                self.skip_char(3)
                while self.str_at(0) and self.str_at(0) != ']':
                    reg = reg + self.str_at(0)
                    self.skip_char()
                if self.s[0:1] == ']':
                    self.skip_char()
                else:
                    return False
            elif self.prch(2):
                reg = self.str_at(2)
                self.skip_char(3)
            else:
                return False

            if reg in self.reg_table:
                # Expand the whole value, then resume the original line.
                old_s = self.s
                self.s = self.reg_table[reg]
                self.text()
                self.s = old_s
                return True
        return False

    def size(self):
        """Consume a point-size escape (\\sN, \\s+N, \\s-N)."""
        # We require that the string starts with \s
        if self.digit(2) or (self.str_at(2) in '-+' and self.digit(3)):
            self.skip_char(3)
            while self.digit(0):
                self.skip_char()
            return True
        return False

    def spec(self):
        """Consume a special character (\\(xx) or \\%, emitting its text."""
        self.specletter = False
        if self.s[0:2] == '\\(' and self.prch(2) and self.prch(3):
            key = self.s[2:4]
            if key in Deroffer.g_specs_specletter:
                self.condputs(Deroffer.g_specs_specletter[key])
                self.specletter = True
            elif key in Deroffer.g_specs:
                self.condputs(Deroffer.g_specs[key])
            self.skip_char(4)
            return True
        elif self.s.startswith('\\%'):
            self.specletter = True
            self.skip_char(2)
            return True
        else:
            return False

    def esc(self):
        """Consume a generic two-character escape, emitting its plain form."""
        # We require that the string start with backslash
        c = self.s[1:2]
        if not c:
            return False
        if c in 'eE':
            self.condputs('\\')
        elif c in 't':
            self.condputs('\t')
        elif c in '0~':
            self.condputs(' ')
        elif c in '|^&:':
            pass
        else:
            self.condputs(c)
        self.skip_char(2)
        return True

    def word(self):
        """Consume and emit a run of words joined by letter-like specials."""
        got_something = False
        while True:
            match = Deroffer.g_re_word.match(self.s)
            if not match:
                break
            got_something = True
            self.condputs(match.group(0))
            self.skip_char(match.end(0))

            # Consume all specials
            while self.spec():
                if not self.specletter:
                    break

        return got_something

    def text(self):
        """Emit the rest of the input, interpreting escapes; always True."""
        while True:
            idx = self.s.find('\\')
            if idx == -1:
                self.condputs(self.s)
                self.s = ''
                break
            else:
                self.condputs(self.s[:idx])
                self.skip_char(idx)
                if not self.esc_char_backslash():
                    # Unrecognised escape: emit the character literally.
                    self.condputs(self.str_at(0))
                    self.skip_char()
        return True

    def number(self):
        """Consume and emit an optionally signed integer."""
        match = Deroffer.g_re_number.match(self.s)
        if not match:
            return False
        else:
            self.condputs(match.group(0))
            self.skip_char(match.end())
            return True

    def esc_char_backslash(self):
        """Dispatch on the character following a leading backslash."""
        # Like esc_char, but we know the string starts with a backslash
        c = self.s[1:2]
        if c == '"':
            return self.comment()
        elif c == 'f':
            return self.font()
        elif c == 's':
            return self.size()
        elif c in 'hvwud':
            return self.numreq()
        elif c in 'n*':
            return self.var()
        elif c == '(':
            return self.spec()
        else:
            return self.esc()

    def esc_char(self):
        """Consume one escape, word or number from the front of the input."""
        if self.s[0:1] == '\\':
            return self.esc_char_backslash()
        return self.word() or self.number()

    def quoted_arg(self):
        """Consume a double-quoted argument up to (not including) the closing quote."""
        if self.str_at(0) == '"':
            self.skip_char()
            while self.s and self.str_at(0) != '"':
                if not self.esc_char():
                    if self.s:
                        self.condputs(self.str_at(0))
                        self.skip_char()
            return True
        else:
            return False

    def text_arg(self):
        """Consume one whitespace-delimited argument; True if text was emitted."""
        # PCA: The deroff.c textArg() disallowed quotes at the start of an argument
        # I'm not sure if this was a bug or not
        got_something = False
        while True:
            match = Deroffer.g_re_not_backslash_or_whitespace.match(self.s)
            if match:
                # Output the characters in the match
                self.condputs(match.group(0))
                self.skip_char(match.end(0))
                got_something = True

            # Next is either an escape, or whitespace, or the end
            # If it's the whitespace or the end, we're done
            if not self.s or self.is_white(0):
                return got_something

            # Try an escape
            if not self.esc_char():
                # Some busted escape? Just output it
                self.condputs(self.str_at(0))
                self.skip_char()
                got_something = True

    def text_arg2(self):
        """Character-at-a-time alternative to text_arg()."""
        if not self.esc_char():
            if self.s and not self.is_white(0):
                self.condputs(self.str_at(0))
                self.skip_char()
            else:
                return False
        while True:
            if not self.esc_char():
                if self.s and not self.is_white(0):
                    self.condputs(self.str_at(0))
                    self.skip_char()
                else:
                    return True

    # ------------------------------------------------------------------
    # Macro functions
    #
    # Each handler is called with self.s positioned at the macro name.
    # Returning True means "line fully handled"; False lets
    # request_or_macro() go on to emit the macro's arguments.
    # ------------------------------------------------------------------

    def macro_sh(self):
        """.SH: note whether this is a synopsis header."""
        self.inheader = self.s[2:].startswith(Deroffer.g_synopsis_headers)
        self.nobody = True
        return False

    def macro_ss_ip(self):
        """.SS / .IP / .H: header-like macros whose arguments are emitted."""
        self.nobody = True
        return False

    def macro_i_ir(self):
        """Font macros (.I, .B, .BR, ...): just emit the arguments."""
        return False

    def macro_close_bracket(self):
        """'.] ': end of a refer block."""
        self.refer = False
        return False

    def macro_ps(self):
        """.PS: start of a pic block."""
        if self.is_white(2):
            self.pic = True
        self.condputs('\n')
        return True

    def macro_pe(self):
        """.PE: end of a pic block."""
        if self.is_white(2):
            self.pic = False
        self.condputs('\n')
        return True

    def macro_ts(self):
        """.TS: start of a table; the next line may hold tbl options."""
        if self.is_white(2):
            self.tbl, self.tblstate = True, self.OPTIONS
        self.condputs('\n')
        return True

    def macro_t_and(self):
        """.T&: a new format section inside a table."""
        if self.is_white(2):
            self.tbl, self.tblstate = True, self.FORMAT
        self.condputs('\n')
        return True

    def macro_te(self):
        """.TE: end of a table."""
        if self.is_white(2):
            self.tbl = False
        self.condputs('\n')
        return True

    def macro_eq(self):
        """.EQ: start of an eqn block."""
        if self.is_white(2):
            self.eqn = True
        self.condputs('\n')
        return True

    def macro_en(self):
        """.EN: end of an eqn block."""
        if self.is_white(2):
            self.eqn = False
        self.condputs('\n')
        return True

    def macro_r1(self):
        """.R1: start of a refer command block."""
        if self.is_white(2):
            self.refer2 = True
        self.condputs('\n')
        return True

    def macro_r2(self):
        """.R2: end of a refer command block."""
        if self.is_white(2):
            self.refer2 = False
        self.condputs('\n')
        return True

    def macro_de(self):
        """.de: start of a macro definition; suppress output until '..'."""
        # Must be the instance attribute: a bare ``macro = True`` only
        # created a throw-away local and suppressed nothing.
        self.macro = True
        self.condputs('\n')
        return True

    def macro_bl_vl(self):
        """List-start macros (.BL, .VL, .AL, ...)."""
        if self.is_white(2):
            self.inlist = True
        self.condputs('\n')
        return True

    def macro_bv(self):
        """.BVL: list start, recognised via the 'BV' prefix."""
        if self.str_at(2) == 'L' and self.is_white(3):
            self.inlist = True
        self.condputs('\n')
        return True

    def macro_le(self):
        """.LE: end of a list."""
        if self.is_white(2):
            self.inlist = False
        self.condputs('\n')
        return True

    def macro_lp_pp(self):
        """Paragraph macros (.LP, .PP, .P)."""
        self.condputs('\n')
        return True

    def macro_ds(self):
        """.ds NAME VALUE: define a string register.

        VALUE is the rest of the line (it may contain spaces).  One leading
        double quote is dropped, which is how roff lets a value start with
        blanks.
        """
        self.skip_char(2)
        self.skip_leading_whitespace()
        if self.str_at(0):
            # Split the name off; keep the remainder intact as the value.
            comps = self.s.split(None, 1)
            if len(comps) == 2:
                name, value = comps
                value = value.rstrip()
                if value.startswith('"'):
                    value = value[1:]
                self.reg_table[name] = value
        self.condputs('\n')
        return True

    def macro_so_nx(self):
        """.so / .nx: include directives are ignored."""
        # We always ignore include directives
        # deroff.c for some reason allowed this to fall through to the 'tr' case
        # I think that was just a bug so I won't replicate it
        return True

    def macro_tr(self):
        """.tr abcd...: register character translations (a->b, c->d, ...)."""
        self.skip_char(2)
        self.skip_leading_whitespace()
        while self.s and self.str_at(0) != '\n':
            c = self.str_at(0)
            ns = self.str_at(1)
            self.skip_char(2)
            if not ns or ns == '\n':
                ns = ' '
            self.tr_from += c
            self.tr_to += ns

        # Update our table, then swap in the slower tr-savvy condputs.
        # str.maketrans exists on Python 3, string.maketrans on Python 2.
        maketrans = getattr(str, 'maketrans', None) or string.maketrans
        self.tr = maketrans(self.tr_from, self.tr_to)
        self.condputs = self.condputs_tr
        return True

    def macro_sp(self):
        """.sp: vertical space becomes a newline."""
        self.condputs('\n')
        return True

    def macro_other(self):
        """Any unrecognised request: emit a newline and drop the line."""
        self.condputs('\n')
        return True

    # ------------------------------------------------------------------
    # Line-level drivers
    # ------------------------------------------------------------------

    def request_or_macro(self):
        """Handle a line that starts with a control character ('.' or \"'\")."""
        # s[0] is period or open single quote
        self.skip_char()
        # After the skip the request name starts at index 0.
        s0 = self.s[0:1]
        if s0 == '\\':
            if self.str_at(1) == '"':
                # .\" comment line
                self.condputs('\n')
                return True
        elif s0 == '[':
            self.refer = True
            self.condputs('\n')
            return True
        elif s0 == ']':
            self.refer = False
            self.skip_char()
            return self.text()
        elif s0 == '.':
            # ".." terminates a macro definition
            self.macro = False
            self.condputs('\n')
            return True

        self.nobody = False
        s0s1 = self.s[0:2]

        macro_func = Deroffer.g_macro_dict.get(s0s1, Deroffer.macro_other)
        if macro_func(self):
            return True

        if self.skipheaders and self.nobody:
            return True

        # Skip the macro name, then emit each of its arguments.
        self.skip_leading_whitespace()
        while self.s and not self.is_white(0):
            self.skip_char()
        self.skip_leading_whitespace()
        while True:
            if not self.quoted_arg() and not self.text_arg():
                if self.s:
                    self.condputs(self.str_at(0))
                    self.skip_char()
                else:
                    return True

    def request_or_macro2(self):
        """Legacy entry point kept for compatibility.

        This used to be an inline if/elif copy of request_or_macro() with
        the same handlers (and the same bugs); it now simply delegates.
        """
        return self.request_or_macro()

    def do_tbl(self):
        """Process one line inside a .TS/.TE table; always returns True."""
        # The options line is optional and always ends with ';'.  Without a
        # semicolon the first line after .TS is already a format line.
        if self.tblstate == self.OPTIONS and ';' not in self.s:
            self.tblstate = self.FORMAT

        if self.tblstate == self.OPTIONS:
            while self.s and self.str_at(0) != ';' and self.str_at(0) != '\n':
                self.skip_leading_whitespace()
                if not self.str_at(0).isalpha():
                    # deroff.c has a bug where it can loop forever here...we try to work around it
                    self.skip_char()
                    continue

                # Parse option: NAME or NAME(ARG)
                idx = 0
                while self.s[idx:idx+1].isalpha():
                    idx += 1
                option = self.s[:idx]
                arg = ''
                if self.s[idx:idx+1] == '(':
                    self.s = self.s[idx+1:]
                    close = self.s.find(')')
                    if close != -1:
                        arg = self.s[:close]
                        self.s = self.s[close+1:]
                    else:
                        arg = self.s
                else:
                    # Keep scanning: further options (e.g. tab(x)) may
                    # follow an option that has no argument.
                    self.s = self.s[idx:]

                if option.lower() == 'tab':
                    self.tblTab = arg[0:1]

            self.tblstate = self.FORMAT
            self.condputs('\n')

        elif self.tblstate == self.FORMAT:
            while self.s and self.str_at(0) != '.' and self.str_at(0) != '\n':
                self.skip_leading_whitespace()
                if self.str_at(0):
                    self.skip_char()

            # A '.' ends the format section; data rows follow.
            if self.str_at(0) == '.':
                self.tblstate = self.DATA
            self.condputs('\n')

        elif self.tblstate == self.DATA:
            if self.tblTab:
                self.s = self.s.replace(self.tblTab, '\t')
            self.text()
        return True

    def do_line(self):
        """Process the line currently held in self.s."""
        if self.s[0:1] in ".'":
            if not self.request_or_macro():
                return False
        elif self.tbl:
            self.do_tbl()
        else:
            self.text()
        return True

    def deroff(self, str):
        """Process the whole document *str*, appending to the output buffer."""
        lines = str.split('\n')
        for line in lines:
            self.s = line + '\n'
            if not self.do_line():
                break


def deroff_files(files):
    """Deroff each path in *files* (gzip-aware) and write text to stdout.

    Each file name is echoed to stderr before it is processed.
    """
    for arg in files:
        sys.stderr.write(arg + '\n')
        opener = gzip.open if arg.endswith('.gz') else open
        with opener(arg, 'rb') as f:
            data = f.read()
        if not isinstance(data, str):
            # Python 3 hands back bytes; the tables above are Latin-1.
            data = data.decode('latin-1')
        d = Deroffer()
        d.deroff(data)
        d.flush_output(sys.stdout)


if __name__ == "__main__":
    paths = sys.argv[1:]
    # NOTE: this entry point has always run under the profiler and writes
    # its statistics to ./fooprof; flip the flag for a plain run.
    run_profiled = True
    if not run_profiled:
        deroff_files(paths)
    else:
        import cProfile
        import profile
        import pstats
        profile.run('deroff_files(paths)', 'fooprof')
        p = pstats.Stats('fooprof')
        p.sort_stats('time').print_stats(100)