#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
##    indenter.py - Top-level indenter for all files
##
##    Copyright ©2013 Ben Longbons
##
##    This file is part of The Mana World (Athena server)
##
##    This program is free software: you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation, either version 3 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with this program.  If not, see <http://www.gnu.org/licenses/>.

from collections import namedtuple
import io
import string
import subprocess
import sys


# Settings.

class LexSettings:
    ''' Tunables for the lex/yacc definition and rule tables.
    '''
    # minimum number of spaces between the left and right table columns
    pad = 2
    # the right column is aligned to a multiple of this
    indent = 4
    # whether loose code in the definitions section is wrapped in %{ %}
    brace = True
    # extra indentation applied to code nested in the definitions section
    nested_indent = 0 # 4


# Code.

class Location(namedtuple('Location', ('name', 'line', 'column', 'text'))):
    ''' A 1-based source position plus the text of the line it is on.

        The diagnostic methods print compiler-style messages to stderr.
    '''
    __slots__ = ()

    def _diagnostic(self, level, msg, to):
        ''' Print ``msg`` at this location, with the source line and a caret.

            If ``to`` is given it must be a later Location on the same line;
            the range up to it is underlined with '~'.
        '''
        print('{file}:{line}:{column}: {level}: {msg}'.format(
                file=self.name, line=self.line, column=self.column,
                level=level, msg=msg),
            file=sys.stderr)
        print(self.text, file=sys.stderr)
        if to:
            assert to.name == self.name
            assert to.line == self.line
            assert to.column >= self.column
        else:
            to = self
        print(' ' * (self.column - 1) + '^' + '~' * (to.column - self.column),
            file=sys.stderr)

    def error(self, msg, to=None):
        ''' Print an 'error' diagnostic.
        '''
        self._diagnostic('error', msg, to)

    def warning(self, msg, to=None):
        ''' Print a 'warning' diagnostic.
        '''
        self._diagnostic('warning', msg, to)

    def note(self, msg, to=None):
        ''' Print a 'note' diagnostic.
        '''
        self._diagnostic('note', msg, to)

    def fatal(self, msg, to=None):
        ''' Print an 'error' diagnostic, then exit with status 1.
        '''
        self.error(msg, to)
        sys.exit(1)


class Reader(object):
    ''' Character-at-a-time reader over a line-oriented text stream.

        Exactly one line is buffered at a time; every buffered line ends
        with a newline (one is added to an unterminated final line), and
        the buffer is empty once the stream is exhausted.
    '''
    __slots__ = ('_name', '_stream', '_buffer', '_line', '_column')

    def __init__(self, name, stream, line=1, column=1):
        ''' Create a new character reader that is smart with lines.

            ``line`` and ``column`` are the 1-based position that the first
            character of ``stream`` is reported at.
        '''
        self._name = name
        self._stream = stream
        # pretend we sit on the newline of the preceding line,
        # so that the first adv() pulls in the first real line
        self._buffer = '\n'
        self._line = line - 1
        self._column = 0

        column -= 1
        self.adv()
        if self._buffer:
            # no skew on input (actually belongs below)
            self._buffer = ' ' * column + self._buffer
            self._column = column
        # else: the stream was empty; padding the buffer would make
        # get() index past its end

    def get(self):
        ''' Fetch the current character, or falsy on EOF
        '''
        if self._buffer:
            return self._buffer[self._column]
        else:
            return None # less prone to accidental errors than ''

    def loc(self):
        ''' Fetch the Location of the current character.
        '''
        # internally we store 0-based, but users want 1-based
        # also, cut off the newline
        return Location(self._name, self._line, self._column + 1,
                self._buffer[:-1])

    def adv(self):
        ''' Advance to the next character, reading a new line if needed.

            Does nothing once EOF has been reached.
        '''
        if not self._buffer:
            return
        if self._buffer[self._column] == '\n':
            self._buffer = self._stream.readline()
            self._line += 1
            self._column = 0
            if self._buffer and not self._buffer.endswith('\n'):
                self._buffer += '\n'
        else:
            self._column += 1


def string_reader(s, name='', line=1, column=1):
    ''' Create a Reader over the contents of the string ``s``.
    '''
    return Reader(name, io.StringIO(s), line, column)


def _append(b, c):
    ''' Append the text ``c`` to the bytearray ``b``, encoded as UTF-8.

        A bytearray cannot be extended with a str directly on Python 3.
    '''
    b.extend(c.encode('utf-8'))


def take_while(b, r, s):
    ''' Append characters from ``r`` to ``b`` for as long as they are in ``s``.

        The first character not in ``s`` is left unread.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)
    s = frozenset(s)
    while True:
        c = r.get()
        if not c or c not in s:
            break
        _append(b, c)
        r.adv()


def take_mlc(b, r):
    ''' Append the rest of a /* multi-line comment */ to ``b``.

        The opening '/*' must already have been consumed. Reaching EOF
        first is reported as a fatal error.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)
    star = False
    while True:
        c = r.get()
        if not c:
            r.loc().fatal('Unterminated /* comment')
        r.adv()
        _append(b, c)
        if star and c == '/':
            return
        star = c == '*'


def take_slc(b, r):
    ''' Append the rest of a // single-line comment to ``b``.

        The terminating newline is left unread. A backslash immediately
        before a newline continues the comment onto the next line.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)
    bs = False
    while True:
        c = r.get()
        if not c:
            # EOF directly after a backslash-newline
            return
        if c == '\n' and not bs:
            return
        r.adv()
        _append(b, c)
        bs = c == '\\'


def _take_quoted(b, r, quote, what):
    ''' Append characters up to and including an unescaped ``quote``.

        ``what`` names the construct for the fatal error printed at EOF.
    '''
    bs = False
    while True:
        c = r.get()
        if not c:
            r.loc().fatal('Unterminated %s' % what)
        r.adv()
        _append(b, c)
        if not bs and c == quote:
            return
        # a backslash escapes exactly one following character
        bs = not bs and c == '\\'


def take_char(b, r):
    ''' Append the rest of a 'character literal' to ``b``.

        The opening quote must already have been consumed.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)
    _take_quoted(b, r, '\'', 'character literal')


def take_str(b, r):
    ''' Append the rest of a "string literal" to ``b``.

        The opening quote must already have been consumed.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)
    _take_quoted(b, r, '"', 'string literal')


def round_up(i, a):
    ''' Round ``i`` up to the nearest multiple of ``a``.
    '''
    m = i % a
    if m:
        i += (a - m)
    return i


class Table:
    ''' Aligned output
    '''
    def __init__(self):
        # list of (left, right) pairs; right is '' for unaligned lines
        self.buf = []
        # length of the longest left column that has a right column
        self.size = 0

    def put1(self, line):
        ''' Queue a line that takes no part in the alignment.
        '''
        line = line.rstrip()
        self.buf.append((line, ''))

    def put2(self, left, right):
        ''' Queue a two-column line; ``right`` will be aligned on flush().
        '''
        left = left.rstrip()
        right = right.strip()
        self.buf.append((left, right))
        if right and len(left) > self.size:
            self.size = len(left)

    def flush(self):
        ''' Write every queued line to stdout and reset the table.
        '''
        self.size += LexSettings.pad
        self.size = round_up(self.size, LexSettings.indent)
        for l, r in self.buf:
            if not r:
                sys.stdout.writelines([l, '\n'])
            else:
                need = self.size - len(l)
                sys.stdout.writelines([l, ' ' * need, r, '\n'])
        del self.buf[:]
        self.size = 0


def _emit_pending_code(table, code):
    ''' Queue loose definitions-section code, formatted, onto ``table``.

        Whitespace-only ``code`` collapses to a single blank line, and
        empty ``code`` produces nothing.
    '''
    if code.strip():
        if LexSettings.brace:
            table.put1('%{')
        for line2 in indent_cpp_slop(code):
            table.put1(LexSettings.nested_indent * ' ' + line2)
        if LexSettings.brace:
            table.put1('%}')
        table.put1('')
    elif code:
        table.put1('')


def format_lex_or_yacc_definitions():
    'definitions section (mostly used for options actually)'
    table = Table()
    in_code = False
    code = ''
    for line in sys.stdin:
        if line == '%%\n':
            break
        if line == '%{\n':
            in_code = True
            continue
        if in_code:
            if line == '%}\n':
                in_code = False
                continue
            code += line
            continue
        if not line.strip() or line != line.lstrip():
            # starts with whitespace or is an empty line ('\n')
            code += line
            continue
        _emit_pending_code(table, code)
        code = ''

        if line.startswith('%'):
            # %top is flex, %code and %union are bison
            union = line.startswith('%union')
            if union or line.startswith(('%top', '%code')):
                # TODO fix stupidity when in strings or comments
                count = line.count('{')
                if union:
                    assert count <= 1
                    # the leading '%' is put back after indenting, below
                    code += line[1:]
                else:
                    if count:
                        assert count == 1
                        code += line[line.find('{'):]
                        table.put1(line[:line.find('{')])
                    else:
                        table.put1(line.rstrip())
                assert line.count('}') == 0
                # keep consuming lines until the braces balance
                for line in sys.stdin:
                    count += line.count('{') - line.count('}')
                    code += line
                    assert count >= 0
                    if count == 0:
                        break
                if union:
                    first = True
                    for line2 in indent_cpp_slop(code):
                        if first:
                            line2 = '%' + line2
                            first = False
                        table.put1(line2)
                else:
                    for line2 in indent_cpp_slop(code):
                        table.put1(LexSettings.nested_indent * ' ' + line2)
                code = ''
            else:
                table.put1(line)
        elif line[0].isalpha() or line[0] == '_':
            fields = line.split(None, 1)
            if len(fields) == 2:
                table.put2(*fields)
            else:
                # a name without an expansion has nothing to align
                table.put1(line)
        else:
            table.put1(line)

    assert not in_code
    if code.strip():
        # code collected right before the '%%' must not be thrown away
        _emit_pending_code(table, code)
    table.flush()
    sys.stdout.write('\n%%\n')


def _lex_pattern_end(line):
    ''' Return the index just past the lex pattern that starts ``line``.

        The pattern ends at the first whitespace that is not escaped and
        not inside a "string", a [class] or (parentheses). The result is
        0 if the line starts with whitespace.
    '''
    n = len(line)
    i = 0
    p = 0
    while i < n:
        c = line[i]
        if c == '\\':
            # skip the backslash and whatever it escapes
            i += 2
            continue
        if not p and c.isspace():
            break
        if c == '"':
            i += 1
            while i < n and line[i] != '"':
                if line[i] == '\\':
                    i += 1
                i += 1
        elif c == '[':
            i += 1
            if i < n and line[i] == '^':
                i += 1
            while i < n and line[i] != ']':
                i += 1
        elif c == '(':
            p += 1
        elif c == ')':
            assert p
            p -= 1
        i += 1
    # an unterminated construct can step past the end of the line
    return min(i, n)


def format_lex_rules():
    'rule section'
    table = Table()
    for line in sys.stdin:
        if line == '%%\n':
            break
        if line.startswith('<') and not line.startswith('<<'):
            raise NotImplementedError('start conditions not yet supported')
        i = _lex_pattern_end(line)
        if not i:
            table.put1('')
            continue
        pattern = line[:i]
        rule = line[i:]
        # pull in further lines until the action's braces balance
        count = rule.count('{') - rule.count('}')
        while count:
            blah = next(sys.stdin)
            rule += blah
            count += blah.count('{') - blah.count('}')
        rules = indent_cpp_slop(rule)
        table.put2(pattern, rules[0])
        for line in rules[1:]:
            table.put1(line)
    table.flush()
    sys.stdout.write('%%\n')


def _quoted_token_end(line, quote, what):
    ''' Return the index just past the quoted token that starts ``line``.

        ``line[0]`` is the opening quote. Raises Exception('broken ' + what)
        if there is no unescaped closing ``quote``.
    '''
    bs = False
    for i, c in enumerate(line):
        if bs:
            # this character is escaped; it cannot end the token
            bs = False
            continue
        if c == '\\':
            bs = True
        elif i and c == quote:
            return i + 1
    raise Exception('broken ' + what)


def format_yacc_rules():
    '''
    tokens are any of:
        word
        word[namedref]
        'c'
        "str"
        { code }
    break before {
    break twice before a : or |
    break twice before and thrice after a ;
    put a softspace after everything except ;
    '''
    sys.stdout.write('\n')
    softspace = '' # NOT reset by new lines
    for line in sys.stdin:
        if line == '%%\n':
            break
        line = line.strip()
        while line:
            if line.startswith("'") or line.startswith('"'):
                if line.startswith("'"):
                    i = _quoted_token_end(line, "'", 'char')
                else:
                    i = _quoted_token_end(line, '"', 'string')
                word = line[:i]
                line = line[i:].lstrip()
                sys.stdout.writelines([softspace, word])
                softspace = ' '
                continue
            if line.startswith(':'):
                line = line[1:].lstrip()
                sys.stdout.write('\n\n:')
                softspace = ' '
                continue
            if line.startswith('{'):
                line += '\n'
                lines = ''
                # TODO fix braces in comments and strings
                lo = 1
                # number of braces still open
                behold = 1
                while behold:
                    i = line.find('}', lo)
                    if i == -1:
                        behold += line[lo:].count('{')
                        lines += line
                        line = next(sys.stdin)
                        lo = 0
                    else:
                        behold -= 1
                        i += 1
                        behold += line[lo:i].count('{')
                        lo = i
                lines += line[:lo]
                for line2 in indent_cpp_slop(lines):
                    sys.stdout.writelines(['\n', line2])
                line = line[lo:].strip()
                softspace = ' '
                continue
            if line.startswith(';'):
                line = line[1:].lstrip()
                sys.stdout.write('\n\n;\n\n\n')
                softspace = ''
                continue
            if line.startswith('|'):
                line = line[1:].lstrip()
                sys.stdout.write('\n\n|')
                softspace = ' '
                continue
            # screw comments
            word, _, line = line.partition(' ')
            line = line.lstrip()
            if word.endswith(':'):
                # split "name:" into the name and a ':' token of its own
                word = word[:-1]
                line = ':' + line
            sys.stdout.writelines([softspace, word])
            softspace = ' '
            continue
        # while line
    # for line in stdin
    sys.stdout.write('%%\n')


def format_lex():
    '''
    A lex file is a series of sections.

    In the initial section:
        If it begins with whitespace, it is indented code
        It might be a /* comment */
        It might be a #line
        It might be a %s, %x, %pointer, %array, %option %[a-z][0-9].*
        It might be a %{ codeblock %}
        It might be a %top { codeblock }
        It might be a name and an expansion
        A %% switches to the second section

    In a comment:
        */ is the end

    In a codeblock:
        if it started with %{, %} ends it
        if it started with %top{, } ends it if it matches the nesting

    In section 2's header:
        there may be %{ %} sections, possibly nested
        there may also be indented code
        there may be unindented code if it's inside the %{ %}

    In section 2 proper:
        pattern action
        pattern action
        {
            pattern action
        }
        a %% switches to section 3

    In section 3:
        everything is just C code
    '''
    format_lex_or_yacc_definitions()
    format_lex_rules()
    format_cc()


def format_yacc():
    '''
    A yacc file is a series of sections.

    In the initial section:
        whitespace and comments are ignored.
        %someoption
        =
        |
        ;
        name
        name:
        int
        'char'
        "string"
        <*>
        <>
        %{ prologue %}
        { braced code }
        [ bracketed identifier ]
        %% switch to section 2

    In the second section:
        is actually the same! wtf?
        But in practice:

        name:
            symbol 'c' "str" { code }
        |
            symbol 'c' "str" { code } /* in any order */
        ;

        any name may instead be name[namedref]
        code may additionally contain $$, $1, $namedref

    In section 3:
        everything is C code.
    '''
    format_lex_or_yacc_definitions()
    format_yacc_rules()
    format_cc()


def format_cc():
    ''' Pipe the rest of stdin through 'indenter-cpp' and exit with its status.

        The child writes straight to our stdout.
    '''
    # whatever we have written so far must come before the child's output
    sys.stdout.flush()
    # text mode: we are feeding it str lines, not bytes
    tail = subprocess.Popen(['indenter-cpp'], stdin=subprocess.PIPE,
            stdout=None, universal_newlines=True)
    tail.stdin.writelines(sys.stdin)
    tail.stdin.close()
    sys.exit(tail.wait())


def indent_cpp_slop(code):
    ''' Format a C++ fragment with 'indenter-cpp'; return its output lines.

        Exits with the child's status if the child fails.
    '''
    # text mode: ``code`` is a str and the result is split as a str
    proc = subprocess.Popen(['indenter-cpp'], stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, universal_newlines=True)
    rv = proc.communicate(code)[0].strip().split('\n')
    prv = proc.wait()
    if prv:
        sys.exit(prv)
    return rv


_operator_tokens = {
    '#', '##',
    '+', '++', '+=',
    '-', '--', '-=', '->', '->*',
    '*', '*=',
    '/', '/=',
    '%', '%=',
    '=', '==',
    '!', '!=',
    '~',
    '|', '||', '|=',
    '&', '&&', '&=',
    '^', '^=',
    '<', '<=', '<<', '<<=',
    '>', '>=', '>>', '>>=',
    '.', '..', '.*', '...',
    ':', '::',
    '(', ')',
    '[', ']',
    '{', '}',
    '?',
    ',', ';',

    '//', '/*', # comments are specially handled at end
}
# Maps every operator to the set of suffixes that extend it into a longer
# operator, e.g. '-' maps to {'-', '=', '>', '>*'}.
operators = {
    k: {v[len(k):] for v in _operator_tokens if v != k and v.startswith(k)}
    for k in _operator_tokens
}

# *please* don't use any of these except and, or, and not
operator_map = {
    'and': '&&',
    'and_eq': '&=',
    'bitand': '&',
    'bitor': '|',
    'compl': '~',
    'not': '!',
    'not_eq': '!=',
    'or': '||',
    'or_eq': '|=',
    'xor': '^',
    'xor_eq': '^=',
}

num1 = string.digits
num_x = num1 + '.\''
ident1 = string.ascii_letters + '_$@' # $@ for bison
ident_x = ident1 + string.digits


class CxxLexer(object):
    ''' Splits the characters of a Reader into C++-ish tokens.

        One token of lookahead is held; see get2() and adv().
    '''
    __slots__ = ('_reader', '_w', '_t', '_l', '_namespaces', '_classes')

    def __init__(self, reader):
        ''' Wrap ``reader`` and read the first token.
        '''
        self._reader = reader
        self.adv()
        self._namespaces = []
        self._classes = []

    def get2(self):
        ''' Return the current (whitespace, token, location) triple.

            The token is None at EOF.
        '''
        return self._w, self._t, self._l

    def adv(self):
        ''' Replace the current token with the next one from the reader.
        '''
        self._w, self._t, self._l = self.pull()

    def pull(self):
        ''' Read one token; return (leading whitespace, token, location).

            At EOF this returns ('\\n', None, location). An unrecognised
            character is reported and the program exits with status 1.
        '''
        r = self._reader
        white = []
        while True:
            c = r.get()
            if not c:
                return '\n', None, r.loc()
            if not c.isspace():
                break
            white.append(c)
            r.adv()
        white = ''.join(white)

        # the take_*() helpers fill a bytearray, so the token is
        # accumulated as UTF-8 and decoded once at the end
        black = bytearray(c.encode('utf-8'))
        l = r.loc()
        r.adv()
        if c in operators:
            # maximal munch: extend while a longer operator exists
            while True:
                c = r.get()
                if not c or c.isspace():
                    break
                if c not in operators[black.decode('utf-8')]:
                    break
                _append(black, c)
                r.adv()
            if black == b'/*':
                take_mlc(black, r)
            if black == b'//':
                take_slc(black, r)
        elif c in num1:
            take_while(black, r, num_x)
            c = r.get()
            if c and c in ident1:
                # suffix or exponent glued to the number
                _append(black, c)
                r.adv()
                take_while(black, r, ident_x)
        elif c in ident1:
            take_while(black, r, ident_x)
            c = r.get()
            if black in (b'L', b'u8', b'u', b'U') and c == '"':
                # encoding prefix on a string literal
                _append(black, c)
                r.adv()
                take_str(black, r)
        elif c == '\'':
            take_char(black, r)
        elif c == '"':
            take_str(black, r)
        else:
            l.error('Unknown character: %r' % c)
            sys.exit(1)
        return white, black.decode('utf-8'), l


class Flavored(object):
    ''' A token's text tagged, by subclass, with its syntactic role.
    '''
    __slots__ = ('_str',)

    def __init__(self, s):
        self._str = s


class Control(Flavored):
    __slots__ = ()


class Binary(Flavored):
    __slots__ = ()


class Unary(Flavored):
    __slots__ = ()


class Postfix(Flavored):
    __slots__ = ()


class Type(Flavored):
    __slots__ = ()


class Value(Flavored):
    __slots__ = ()


class Literal(Value):
    __slots__ = ()


class TypeExpr(Flavored):
    __slots__ = ()


class Attr(Flavored):
    __slots__ = ()


class Def(Flavored):
    __slots__ = ()


class MatchHead(Flavored):
    ''' An opening bracket; remembers the closer that must match it.
    '''
    __slots__ = ('_tail_char', '_purpose')

    def __init__(self, s, t, p):
        Flavored.__init__(self, s)
        assert isinstance(t, str)
        self._tail_char = t
        self._purpose = p


class MatchTail(Flavored):
    ''' A closing bracket, linked to the MatchHead it closes.
    '''
    __slots__ = ('_head_obj',)

    def __init__(self, s, head):
        Flavored.__init__(self, s)
        assert isinstance(head, MatchHead)
        self._head_obj = head

    @property
    def _purpose(self):
        return self._head_obj._purpose


class MatchTail2(Flavored):
    ''' A '>>' that closes two nested '<' heads at once.
    '''
    __slots__ = ('_head_inner', '_head_outer')

    def __init__(self, s, head_inner, head_outer):
        Flavored.__init__(self, s)
        assert isinstance(head_inner, MatchHead)
        assert isinstance(head_outer, MatchHead)
        self._head_inner = head_inner
        self._head_outer = head_outer

    @property
    def _purpose(self):
        return self._head_outer._purpose


class CxxFormatter(object):
    ''' Assigns a flavor to each token and decides the whitespace before it.
    '''
    __slots__ = ('_lexer', '_w', '_t', '_types', '_values',
            '_type_expressions', '_scopes')

    def __init__(self, lexer):
        assert isinstance(lexer, CxxLexer)
        self._lexer = lexer
        # whitespace and flavor of the previous token
        self._w = None
        self._t = None
        # stack of currently open MatchHead objects
        self._scopes = []
        self._types = {
            'auto', 'bool', 'char', 'char16_t', 'char32_t', 'double',
            'float', 'int', 'long', 'short', 'signed', 'unsigned', 'void',
            'wchar_t',
        }
        self._values = {
            'alignof', 'const_cast', 'dynamic_cast', 'false', 'nullptr',
            'reinterpret_cast', 'sizeof', 'static_cast', 'this', 'typeid',
            'true',
        }
        self._type_expressions = {
            'decltype',
        }

    # the following two functions should *generally* not access self
    # but they do need to a bit
    def flavor2(self, w, t, l):
        ''' Given the next token and its whitespace, calculate the flavor.

            Note: the need to know the preceding whitespace is a hack
            (but a pretty good one!)

            self.w, self.t, and self.f still contain the previous token.

            ``l`` is the token's Location, used for fatal diagnostics.
            Returns None for comments, otherwise a Flavored instance.
        '''
        if (t.startswith('//') or t.startswith('/*')
                or (t.startswith('#') and len(t) > 2)):
            return None
        if t[0] in num1 or t[0] == '\'':
            return Literal(t)
        if '"' in t:
            if self._t and self._t._str == 'extern':
                # extern "C"
                return Attr(t)
            return Literal(t)
        o = operator_map.get(t, t)
        if o in {
                '#', '!', '~',
        }:
            return Unary(t)
        if o in {
                '##', '+=', '-=', '->', '->*', '*=', '/', '/=', '%', '%=',
                '=', '==', '!=', '|', '||', '|=', '&=', '^', '^=', '<=',
                '<<', '<<=', '>=', '>>=', '.', '..', '.*', '::',
        }:
            return Binary(t)
        if o == '<':
            # preceded by whitespace: taken as less-than, not a bracket
            if w:
                return Binary(t)
        u = {'<': '>', '(': ')', '{': '}', '[': ']'}.get(o)
        if u is not None:
            rv = MatchHead(t, u, None) # fix this, it is CRITICAL
            self._scopes.append(rv)
            return rv
        if o == '>' or o == '>>':
            # not closing an open '<': a comparison or a shift
            if not self._scopes or self._scopes[-1]._str != '<':
                return Binary(t)
        if o == '>>':
            assert len(self._scopes) >= 2
            assert self._scopes[-1]._str == '<'
            assert self._scopes[-2]._str == '<'
            return MatchTail2(t, self._scopes.pop(), self._scopes.pop())
        if o in {'>', ')', '}', ']'}:
            if not self._scopes:
                l.fatal('Unexpected %r' % t)
            if self._scopes[-1]._tail_char != t:
                l.fatal('Expected %r, got %r' % (
                    self._scopes[-1]._tail_char, t))
            return MatchTail(t, self._scopes.pop())
        if o == '...':
            return Postfix(t)
        if o in {'*', '&', '&&'}:
            # after a type these are declarators; else fall through
            if isinstance(self._t, Type):
                return Type(t)
        if o in {'+', '-', '*', '&', '&&'}:
            # && is a gcc extension for address-of-a-label
            if isinstance(self._t, (Unary, Binary, Control, MatchHead)):
                return Unary(t)
            elif isinstance(self._t, (Value, Postfix, MatchTail, MatchTail2)):
                return Binary(t)
            else:
                l.fatal('Not sure how to handle ambiguous unary/binary after instance of %r' % self._t.__class__.__name__)
        if o in {'--', '++'}:
            if isinstance(self._t, (Unary, Binary, Control, MatchHead)):
                return Unary(t)
            elif isinstance(self._t, (Value, Postfix, MatchTail, MatchTail2)):
                return Postfix(t)
            else:
                l.fatal('Not sure how to handle ambiguous prefix/postfix after instance of %r' % self._t.__class__.__name__)
        if o in {',', ';'}:
            return Binary(t)
        if o == '?':
            return Binary(t)
        if o == ':':
            return Binary(t)
        assert t == o
        assert t[0] in ident1
        assert all(c in ident_x for c in t[1:])

        # keywords!
        if t == '__attribute__':
            return Attr(t)
        if t in {
                'alignas', 'constexpr', 'explicit', 'export', 'extern',
                'friend', 'inline', 'mutable', 'register', 'static',
                'thread_local', 'virtual',
        }:
            return Attr(t)
        if t in {
                'const', 'volatile',
        }:
            if self._t is None or isinstance(
                    self._t, (Attr, Binary, MatchTail, MatchTail2)):
                # ; is binary
                return Attr(t)
            if isinstance(self._t, Def):
                # trailing function
                return Attr(t)
            return Type(t)
        if t in {'final', 'override'}:
            # theoretically, should only do this for thingy
            return Attr(t)
        if t == 'noexcept':
            if isinstance(self._t, (Attr, MatchTail, MatchTail2)):
                return Attr(t)
            else:
                return Value(t)
        if t == 'asm':
            return Value(t)
        if t in {'delete', 'default'} and isinstance(self._t, Binary):
            # = delete
            return Value(t)
        if t in {'new', 'delete'}:
            return Unary(t)
        if t in {'case', 'goto', 'return', 'throw'}:
            return Unary(t)
        if t in {'default', 'public', 'private', 'protected'}:
            return Value(t)
        if t in {'break', 'continue'}:
            return Value(t)
        if t in {'try', 'catch', 'do', 'else', 'for', 'if', 'switch',
                'while'}:
            return Control(t)
        if t in {'class', 'enum', 'struct', 'typename', 'union'}:
            return Def(t)
        if t == 'static_assert':
            return Value(t)
        if t == 'operator':
            return Value(t)
        if t == 'namespace':
            return Def(t)
        if t == 'template':
            return Def(t)
        if t == 'typedef':
            return Def(t)
        if t == 'using':
            return Unary(t)
        if t in self._type_expressions:
            return TypeExpr(t)

        # types, values, and keywords that act like one of those
        if t in self._types:
            return Type(t)
        if t in self._values:
            return Value(t)
        # otherwise go by capitalisation, ignoring underscores
        u = t.replace('_', '')
        if u.isupper():
            return Value(t)
        if u and u[0].isupper():
            return Type(t)
        return Value(t)

    def whitespace(self, pt, t):
        ''' Given a token and its flavor, calculate its whitespace.

            ``pt`` is the previous token's flavor. No rules are installed
            yet, so the whitespace from the input is returned unchanged.
        '''
        w = self._w # TODO set to '' instead to force calculation
        for func in [
        ]:
            w = func(w, pt, t)
        return w


def format_ii():
    ''' Lex stdin as C++ and write each token, with its whitespace, to stdout.
    '''
    r = Reader('', sys.stdin)
    l = CxxLexer(r)
    f = CxxFormatter(l)
    while True:
        wspace, raw_tok, loc = l.get2()
        assert isinstance(wspace, str)
        if raw_tok is None:
            break
        assert isinstance(raw_tok, str)
        l.adv()

        prev_tok = f._t
        cooked_tok = f.flavor2(wspace, raw_tok, loc)
        if cooked_tok is None:
            # a comment: passed through as-is
            f._w = wspace # or ' '
            # f._t is unchanged
            sys.stdout.writelines([wspace, raw_tok])
        else:
            f._w = wspace
            f._t = cooked_tok
            wspace = f.whitespace(prev_tok, cooked_tok)
            sys.stdout.writelines([wspace, cooked_tok._str])
    sys.stdout.write('\n')


exts = {
    '-lpp': format_lex,
    '-ypp': format_yacc,
    '-cpp': format_cc,
    '-ipp': format_ii,
}

if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.exit('Usage: %s -ext < input.ext > output.ext' % sys.argv[0])
    func = exts.get(sys.argv[1])
    if not func:
        sys.exit('Bad -ext')
    func()