path: root/tools/indenter



#!/usr/bin/env python
# -*- encoding: utf-8 -*-
##    indenter.py - Top-level indenter for all files
##
##    Copyright ©2013 Ben Longbons <b.r.longbons@gmail.com>
##
##    This file is part of The Mana World (Athena server)
##
##    This program is free software: you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation, either version 3 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with this program.  If not, see <http://www.gnu.org/licenses/>.


from __future__ import print_function

from collections import namedtuple
import cStringIO
import string
import subprocess
import sys


# Settings.
class LexSettings:
    ''' Knobs that control how lex/yacc input is laid out.
    '''
    # Wrap loose definitions-section code in '%{' / '%}' marker lines.
    brace = True
    # Number of spaces put in front of every line of nested code
    # (4 is the commented-out alternative).
    nested_indent = 0 # 4
    # Extra columns added after the widest left-hand cell of a Table.
    pad = 2
    # The right-hand column of a Table starts at a multiple of this.
    indent = 4


# Code.
Location = namedtuple('Location', ('name', 'line', 'column', 'text'))


def _diagnostic(self, level, msg, to):
    ''' Print a compiler-style diagnostic for this location to stderr.

    Three lines are written: 'name:line:column: level: msg', the source
    text of the line, and a marker line with '^' under this column.
    If 'to' is given it must be a Location on the same line of the same
    file, at or after this one; the marker is then extended with '~' up
    to that column.
    '''
    header = '{file}:{line}:{column}: {level}: {msg}'.format(
            file=self.name, line=self.line, column=self.column,
            level=level, msg=msg)
    print(header, file=sys.stderr)
    print(self.text, file=sys.stderr)
    if not to:
        # no range requested: mark only the single column
        to = self
    else:
        assert to.name == self.name
        assert to.line == self.line
        assert to.column >= self.column
    lead = ' ' * (self.column - 1)
    tail = '~' * (to.column - self.column)
    print(lead + '^' + tail, file=sys.stderr)


def _severity(level):
    ''' Build the Location method that reports at the given level.
    '''
    def report(self, msg, to=None):
        self._diagnostic(level, msg, to)
    report.__name__ = level
    return report


# namedtuple classes are ordinary classes, so the methods are simply
# attached after the fact; the helper names are then removed again.
Location._diagnostic = _diagnostic
for _level in ('error', 'warning', 'note'):
    setattr(Location, _level, _severity(_level))
del _diagnostic, _severity, _level


class Reader(object):
    ''' Character-at-a-time reader over a line-oriented stream.

    One line is buffered at a time (always terminated by a newline, which
    is added if the last line of the stream lacks one), so that the
    current line number, column and line text are available for
    diagnostics through loc().
    '''
    __slots__ = ('_name', '_stream', '_buffer', '_line', '_column')

    def __init__(self, name, stream, line=1, column=1):
        ''' Create a new character reader that is smart with lines.

        name is the file name used in Locations, stream is any object
        with a readline() method, and line/column give the 1-based
        position to report for the first character of the stream.
        '''
        self._name = name
        self._stream = stream
        # Pretend we sit on the newline of the line before the first one,
        # so that the adv() below pulls in the real first line.
        self._buffer = '\n'
        self._line = line - 1
        self._column = 0

        column -= 1
        self.adv()
        if self._buffer:
            # Pad the first line so its characters land on the requested
            # columns.  An empty stream is left alone: padding it would
            # produce a buffer without a newline that get() could index
            # past the end of.
            self._buffer = ' ' * column + self._buffer
            self._column = column

    def get(self):
        ''' Fetch the current character, or falsy on EOF
        '''
        if self._buffer:
            return self._buffer[self._column]
        else:
            return None # less prone to accidental errors than ''

    def loc(self):
        ''' Fetch the Location of the current character.
        '''
        # internally we store 0-based, but users want 1-based
        # also, cut off the newline
        return Location(self._name, self._line, self._column + 1,
                self._buffer[:-1])

    def adv(self):
        ''' Step to the next character, reading a new line when needed.

        Does nothing once the end of the stream has been reached.
        '''
        if not self._buffer:
            # already at EOF; there is nothing to index or advance past
            return
        if self._buffer[self._column] == '\n':
            self._buffer = self._stream.readline()
            self._line += 1
            self._column = 0
            # guarantee every non-empty buffer ends with a newline
            if self._buffer and not self._buffer.endswith('\n'):
                self._buffer += '\n'
        else:
            self._column += 1

def string_reader(s, name='<string>', line=1, column=1):
    ''' Build a Reader over the in-memory string s.

    name, line and column are handed to Reader unchanged.
    '''
    stream = cStringIO.StringIO(s)
    return Reader(name, stream, line, column)

def take_while(b, r, s):
    ''' Move characters from the Reader r onto the bytearray b for as
    long as they belong to the collection s; stops at EOF or at the
    first character outside s, which is left unconsumed.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)
    allowed = frozenset(s)
    ch = r.get()
    while ch and ch in allowed:
        b += ch
        r.adv()
        ch = r.get()

def take_mlc(b, r):
    ''' Append the remainder of a multi-line comment to b.

    Characters are consumed from the Reader r up to and including the
    first '*' that is directly followed by '/'.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)

    previous = None
    closed = False
    while not closed:
        ch = r.get()
        r.adv()
        b += ch
        closed = previous == '*' and ch == '/'
        previous = ch

def take_slc(b, r):
    ''' Append the remainder of a single-line comment to b.

    Characters are consumed from the Reader r up to and including the
    first newline that does not directly follow a backslash, so the
    terminating newline becomes part of b.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)

    previous = None
    while True:
        ch = r.get()
        r.adv()
        b += ch
        if ch == '\n' and previous != '\\':
            return
        previous = ch

def take_char(b, r):
    ''' Append the remainder of a character literal to b.

    Characters are consumed from the Reader r up to and including the
    first unescaped single quote; a backslash escapes the character
    that follows it.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)

    escaped = False
    while True:
        ch = r.get()
        r.adv()
        b += ch
        if escaped:
            # this character was protected by the preceding backslash
            escaped = False
        elif ch == '\'':
            return
        else:
            escaped = ch == '\\'

def take_str(b, r):
    ''' Append the remainder of a string literal to b.

    Characters are consumed from the Reader r up to and including the
    first unescaped double quote; a backslash escapes the character
    that follows it.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)

    escaped = False
    while True:
        ch = r.get()
        r.adv()
        b += ch
        if escaped:
            # this character was protected by the preceding backslash
            escaped = False
        elif ch == '"':
            return
        else:
            escaped = ch == '\\'


def round_up(i, a):
    ''' Return i rounded up to the next multiple of a (i itself if it
    already is a multiple).
    '''
    remainder = i % a
    if not remainder:
        return i
    return i + (a - remainder)

class Table:
    ''' Buffered two-column output.

    Rows are collected with put1() (a plain line) and put2() (a left and
    a right cell); flush() writes them to stdout with all right cells
    starting in the same column.
    '''
    def __init__(self):
        # list of (left, right) pairs; right is '' for plain lines
        self.buf = []
        # length of the longest left cell that has a non-empty right cell
        self.size = 0

    def put1(self, line):
        ''' Queue a single-column line (trailing whitespace removed).
        '''
        self.buf.append((line.rstrip(), ''))

    def put2(self, left, right):
        ''' Queue a two-column row; both cells are stripped of
        surrounding whitespace on the sides that face the gap/line end.
        '''
        row = (left.rstrip(), right.strip())
        self.buf.append(row)
        if row[1]:
            self.size = max(self.size, len(row[0]))

    def flush(self):
        ''' Write all queued rows to stdout and reset the table.
        '''
        # right column: widest left cell plus padding, on an indent stop
        self.size = round_up(self.size + LexSettings.pad, LexSettings.indent)
        for left, right in self.buf:
            if right:
                gap = ' ' * (self.size - len(left))
                sys.stdout.writelines([left, gap, right, '\n'])
            else:
                sys.stdout.writelines([left, '\n'])
        del self.buf[:]
        self.size = 0

def format_lex_or_yacc_definitions():
    ''' Reformat the definitions section (mostly used for options actually).

    Lines are read from sys.stdin until a line that is exactly '%%'.
    Code found in %{ %} blocks, on indented lines, and in braced
    %top / %code / %union blocks is run through indent_cpp_slop();
    'name expansion' lines are aligned in two columns.  The result is
    written to sys.stdout, followed by an empty line and '%%'.
    '''
    table = Table()
    in_code = False
    # code collected so far that has not been emitted yet
    code = bytearray()
    for line in sys.stdin:
        if line == '%%\n':
            break
        if line == '%{\n':
            # start of a literal code block; the marker itself is dropped
            in_code = True
            continue
        if in_code:
            if line == '%}\n':
                in_code = False
                continue
            code += line
            continue
        if not line.strip() or line != line.lstrip():
            # starts with whitespace or is an empty line ('\n')
            code += line
            continue
        # An unindented, non-empty line ends any pending run of code:
        # emit that run before looking at the line itself.
        if code.strip():
            if LexSettings.brace:
                table.put1('%{')
            for line2 in indent_cpp_slop(code):
                table.put1(LexSettings.nested_indent * ' ' + line2)
            if LexSettings.brace:
                table.put1('%}')
                table.put1('')
        elif code:
            # only blank lines were pending: collapse them into one
            table.put1('')
        code = bytearray()

        if line.startswith('%'):
            # %top is flex, %code and %union are bison
            union = line.startswith('%union')
            if union or line.startswith('%top') or line.startswith('%code'):
                # TODO fix stupidity when in strings or comments
                # count tracks the current brace nesting depth
                count = line.count('{')
                #code = bytearray()
                if union:
                    assert count <= 1
                    # drop the '%' (it is put back on the first output
                    # line below) -- presumably so that the C++ indenter
                    # sees an ordinary 'union'
                    code += line[1:]
                else:
                    if count:
                        assert count == 1
                        # directive goes out as-is, the brace and what
                        # follows it belongs to the code
                        code += line[line.find('{'):]
                        table.put1(line[:line.find('{')])
                    else:
                        table.put1(line.rstrip())
                assert line.count('}') == 0
                # keep consuming lines until the brace count is back at 0
                for line in sys.stdin:
                    count += line.count('{') - line.count('}')
                    code += line
                    assert count >= 0
                    if count == 0:
                        break
                if union:
                    first = True
                    for line2 in indent_cpp_slop(code):
                        if first:
                            line2 = '%' + line2
                            first = False
                        table.put1(line2)
                else:
                    for line2 in indent_cpp_slop(code):
                        table.put1(LexSettings.nested_indent * ' ' + line2)
                code = bytearray()
            else:
                # any other directive is passed through unchanged
                table.put1(line)
        elif line[0].isalpha() or line[0] == '_':
            # name definition: name in the left column, expansion right
            # NOTE(review): a line holding only a name splits into one
            # piece, so put2() would be called with a missing argument
            table.put2(*line.split(None, 1))
        else:
            table.put1(line)

    assert not in_code
    del code
    del in_code
    table.flush()
    sys.stdout.write('\n%%\n')

def format_lex_rules():
    ''' Reformat the rule section of a lex file.

    Lines are read from sys.stdin until a line that is exactly '%%'.
    Each rule is split into its pattern and its action; the action is
    run through indent_cpp_slop() and the two are aligned in columns.
    The result is written to sys.stdout, followed by '%%'.

    Raises NotImplementedError for lines starting with a <start
    condition> (lines starting with '<<' are let through).
    '''
    table = Table()
    for line in sys.stdin:
        if line == '%%\n':
            break
        if line.startswith('<') and not line.startswith('<<'):
            raise NotImplementedError('start conditions not yet supported')
        # Scan for the end of the pattern: the first whitespace that is
        # not escaped, quoted, inside [...] or inside parentheses.
        i = 0   # index of the character being examined
        p = 0   # current parenthesis nesting depth
        bs = False  # previous character was an unconsumed backslash
        while True:
            if bs:
                # character after a backslash: skip it unexamined
                bs = False
                i += 1
                continue
            if line[i] == '\\':
                bs = True
                i += 1
                continue
            if not p and line[i].isspace():
                break
            if line[i] == '"':
                # quoted string: jump to the closing quote
                i += 1
                while line[i] != '"':
                    if line[i] == '\\':
                        i += 1
                    i += 1
            elif line[i] == '[':
                # character class: jump to the closing bracket
                i += 1
                if line[i] == '^':
                    i += 1
                while line[i] != ']':
                    i += 1
            elif line[i] == '(':
                p += 1
            elif line[i] == ')':
                assert p
                p -= 1
            i += 1
        if not i:
            # the line starts with whitespace, i.e. has no pattern
            # NOTE(review): the rest of such a line is not kept; only an
            # empty line is emitted for it
            table.put1('')
            continue
        del bs
        del p
        pattern = line[:i]
        rule = bytearray(line[i:])
        del i
        # pull in continuation lines until the action's braces balance
        count = rule.count('{') - rule.count('}')
        while count:
            blah = next(sys.stdin)
            rule += blah
            count += blah.count('{') - blah.count('}')
        rules = indent_cpp_slop(rule)
        # first action line shares the row with the pattern, the rest
        # follow as plain lines
        table.put2(pattern, rules[0])
        for line in rules[1:]:
            table.put1(line)

    table.flush()
    sys.stdout.write('%%\n')

def _yacc_literal_end(line, quote, what):
    ''' Return the index just past the quoted literal that starts line.

    line[0] is the opening quote; a backslash escapes the character that
    follows it.  Raises Exception('broken <what>') if the literal is not
    closed on this line.
    '''
    escaped = False
    for i, c in enumerate(line):
        if escaped:
            # this character is protected by the backslash before it;
            # the flag must be cleared or nothing after it would count
            escaped = False
            continue
        escaped = c == '\\'
        if i and c == quote:
            return i + 1
    raise Exception('broken ' + what)


def format_yacc_rules():
    '''
        Reformat the rules section of a yacc file from stdin to stdout,
        stopping at a line that is exactly '%%' (which is echoed).

        tokens are any of:
        word
        word[namedref]
        'c'
        "str"
        { code }
        break before {
        break twice before a : or |
        break twice before and thrice after a ;
        put a softspace after everything except ;

        Code in braces is passed through indent_cpp_slop().  Raises
        Exception if a 'c' or "str" token is not closed on its line.
    '''
    sys.stdout.write('\n')
    softspace = '' # NOT reset by new lines
    for line in sys.stdin:
        if line == '%%\n':
            break
        line = line.strip()
        while line:
            if line.startswith("'"):
                i = _yacc_literal_end(line, "'", 'char')
                word = line[:i]
                line = line[i:].lstrip()
                sys.stdout.writelines([softspace, word])
                softspace = ' '
                continue
            if line.startswith('"'):
                i = _yacc_literal_end(line, '"', 'string')
                word = line[:i]
                line = line[i:].lstrip()
                sys.stdout.writelines([softspace, word])
                softspace = ' '
                continue
            if line.startswith(':'):
                line = line[1:].lstrip()
                sys.stdout.write('\n\n:')
                softspace = ' '
                continue
            if line.startswith('{'):
                line += '\n'
                lines = bytearray()
                # TODO fix braces in comments and strings
                lo = 1      # everything before this index has been counted
                behold = 1  # number of braces that are still open
                while behold:
                    i = line.find('}', lo)
                    if i == -1:
                        # no closer left on this line: count the openers
                        # and continue on the next input line
                        behold += line[lo:].count('{')
                        lines += line
                        line = next(sys.stdin)
                        lo = 0
                    else:
                        behold -= 1
                        i += 1
                        behold += line[lo:i].count('{')
                        lo = i
                lines += line[:lo]
                for line2 in indent_cpp_slop(lines):
                    sys.stdout.writelines(['\n', line2])
                # whatever follows the closing brace is tokenized as usual
                line = line[lo:].strip()
                softspace = ' '
                continue
            if line.startswith(';'):
                line = line[1:].lstrip()
                sys.stdout.write('\n\n;\n\n\n')
                softspace = ''
                continue
            if line.startswith('|'):
                line = line[1:].lstrip()
                sys.stdout.write('\n\n|')
                softspace = ' '
                continue
            # screw comments
            word, _, line = line.partition(' ')
            line = line.lstrip()
            if word.endswith(':'):
                # 'name:' -> emit 'name' now and push the ':' back so the
                # next round handles it as a token of its own
                word = word[:-1]
                line = ':' + line
            sys.stdout.writelines([softspace, word])
            softspace = ' '
            continue
        # while line
    # for line in stdin
    sys.stdout.write('%%\n')

def format_lex():
    ''' Format a complete lex file from stdin to stdout.

    A lex file is a series of sections separated by %% lines.

    Section 1 (definitions) may contain:
        indented code (any line that begins with whitespace)
        a /* comment */, which ends at the first */
        a #line
        a %s, %x, %pointer, %array, %option %[a-z][0-9].*
        a %{ codeblock %}, which ends at %}
        a %top { codeblock }, which ends at the } matching the nesting
        a name and an expansion

    Section 2 begins with a header that may contain:
        %{ %} sections, possibly nested
        indented code
        unindented code, if it is inside the %{ %}

    Section 2 proper consists of:
        pattern         action
        <sc>pattern     action
        <sc>{
            pattern     action
        }

    Section 3 is just C code.
    '''
    # one formatter per section, run in file order
    stages = (format_lex_or_yacc_definitions, format_lex_rules, format_cc)
    for stage in stages:
        stage()

def format_yacc():
    ''' Format a complete yacc file from stdin to stdout.

    A yacc file is a series of sections separated by %% lines.

    Section 1: whitespace and comments are ignored; the tokens are
        %someoption
        =
        |
        ;
        name
        name:
        int

        'char'
        "string"
        <*>
        <>
        <something>
        %{ prologue %}
        { braced code }
        [ bracketed identifier ]

    Section 2 accepts the very same tokens (wtf?), but in practice it
    looks like this:

        name:
            symbol 'c' "str" { code }
        |
            symbol 'c' "str" { code } /* in any order */
        ;

        where any name may instead be name[namedref], and the code may
        additionally contain $$, $1, $namedref

    Section 3 is just C code.
    '''
    # one formatter per section, run in file order
    stages = (format_lex_or_yacc_definitions, format_yacc_rules, format_cc)
    for stage in stages:
        stage()

def format_cc():
    ''' Pipe the rest of stdin through the external 'indenter-cpp'
    program, which writes to our stdout, then exit with its status.
    '''
    # our own buffered output has to come out before the child's
    sys.stdout.flush()
    child = subprocess.Popen(['indenter-cpp'], stdin=subprocess.PIPE, stdout=None)
    pipe = child.stdin
    pipe.writelines(sys.stdin)
    pipe.close()
    status = child.wait()
    sys.exit(status)

def indent_cpp_slop(code):
    ''' Feed code to the external 'indenter-cpp' program and return its
    output, stripped of surrounding whitespace, as a list of lines.
    '''
    child = subprocess.Popen(['indenter-cpp'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    formatted, _ = child.communicate(code)
    return formatted.strip().split('\n')

operators = {
    '#', '##',
    '+', '++', '+=',
    '-', '--', '-=', '->', '->*',
    '*', '*=',
    '/', '/=',
    '%', '%=',
    '=', '==',
    '!', '!=',
    '~',
    '|', '||', '|=',
    '&', '&&', '&=',
    '^', '^=',
    '<', '<=', '<<', '<<=',
    '>', '>=', '>>', '>>=',
    '.', '..', '.*', '...',
    ':', '::',
    '(', ')',
    '[', ']',
    '{', '}',
    '?',
    ',', ';',

    '//', '/*', # comments are specially handled at end
}
operators = {
    k: {v[len(k):] for v in operators if v is not k and v.startswith(k)}
    for k in operators
}

# Characters that may start a number, and characters that may continue
# one (the apostrophe is presumably the digit separator).
num1 = string.digits
num_x = ''.join([num1, '.', "'"])
# Characters that may start an identifier, and characters that may
# continue one.
ident1 = ''.join([string.ascii_letters, '_$@']) # $@ for bison
ident_x = ''.join([ident1, string.digits])

class CxxLexer(object):
    ''' Splits the characters of a Reader into C++-like tokens.

    One token of lookahead is held at all times: get() returns it and
    adv() replaces it with the next one.  A token is a triple
    (whitespace, text, flags): the whitespace that preceded the token,
    the token text, and a flags slot that pull() always leaves as None.
    At the end of the input the text is None.
    '''
    __slots__ = ('_reader', '_w', '_t', '_f', '_namespaces', '_classes')

    def __init__(self, reader):
        self._reader = reader
        # prime the lookahead
        self.adv()
        self._namespaces = []
        self._classes = []

    def get(self):
        ''' Return the current (whitespace, text, flags) triple.
        '''
        return self._w, self._t, self._f

    def adv(self):
        ''' Replace the current token with the next one from the input.
        '''
        self._w, self._t, self._f = self.pull()

    def pull(self):
        ''' Read one token from the reader and return its triple.

        Reports the location and exits with status 1 on a character
        that cannot start any token.
        '''
        reader = self._reader

        # leading whitespace
        leading = bytearray()
        ch = reader.get()
        while ch and ch.isspace():
            leading += ch
            reader.adv()
            ch = reader.get()
        if not ch:
            return '\n', None, None

        # ch is the first character of the token
        token = bytearray()
        token += ch
        start = reader.loc()
        reader.adv()

        if ch in operators:
            # extend greedily for as long as a longer operator matches
            while True:
                nxt = reader.get()
                if not nxt or nxt.isspace():
                    break
                if nxt not in operators[str(token)]:
                    break
                token += nxt
                reader.adv()
            # comment openers swallow the whole comment
            if token == '/*':
                take_mlc(token, reader)
            if token == '//':
                take_slc(token, reader)
        elif ch in num1:
            take_while(token, reader, num_x)
            nxt = reader.get()
            if nxt in ident1:
                # letters glued to the number stay part of the token
                token += nxt
                reader.adv()
                take_while(token, reader, ident_x)
        elif ch in ident1:
            take_while(token, reader, ident_x)
            nxt = reader.get()
            if token in ('L', 'u8', 'u', 'U') and nxt == '"':
                # encoding prefix directly followed by a string literal
                token += nxt
                reader.adv()
                take_str(token, reader)
        elif ch == '\'':
            take_char(token, reader)
        elif ch == '"':
            take_str(token, reader)
        else:
            start.error('Unknown character: %r' % ch)
            sys.exit(1)

        # the reader now sits on the first char of the next thing
        return leading, token, None

def whitespace(w, cur, prev):
    ''' Decide the whitespace to put in front of a token.

    w is the whitespace found in the input, cur is the (text, flags)
    pair of the token about to be written and prev the pair of the token
    before it.  For now the input whitespace is returned unchanged.

    The pairs are unpacked in the body rather than in the parameter
    list, since tuple parameters are no longer valid syntax in newer
    Pythons; callers pass the same three positional arguments.
    '''
    t, f = cur
    pt, pf = prev
    return w

def format_ii():
    ''' Re-emit stdin token by token on stdout.

    Every token is written preceded by whatever whitespace() returns
    for it; a final newline is added if the last token did not end
    with one.  Empty input produces no output.
    '''
    r = Reader('<stdin>', sys.stdin)
    l = CxxLexer(r)
    # text and flags of the previously written token
    pt = None
    pf = None
    while True:
        w, t, f = l.get()
        if not t:
            # no token text: end of input
            break
        l.adv()

        w = whitespace(w, (t, f), (pt, pf))
        sys.stdout.writelines([w, t])
        pt, pf = t, f
    # pt is still None when the input held no tokens at all
    if pt is not None and not pt.endswith('\n'):
        sys.stdout.write('\n')

# Maps the command-line flag to the formatter for that kind of file.
exts = dict([
        ('-lpp', format_lex),
        ('-ypp', format_yacc),
        ('-cpp', format_cc),
        ('-ipp', format_ii),
])

if __name__ == '__main__':
    # Exactly one argument is expected: the flag selecting the formatter.
    if len(sys.argv) != 2:
        # fill in the program name; the placeholder used to be printed raw
        sys.exit('Usage: %s -ext < input.ext > output.ext' % sys.argv[0])
    func = exts.get(sys.argv[1])
    if not func:
        sys.exit('Bad -ext')
    func()
#!/usr/bin/env python
# -*- encoding: utf-8
##    indenter.py - Top-level indenter for all files
##
##    Copyright ©2013 Ben Longbons <b.r.longbons@gmail.com>
##
##    This file is part of The Mana World (Athena server)
##
##    This program is free software: you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation, either version 3 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with this program.  If not, see <http://www.gnu.org/licenses/>.


from __future__ import print_function

from collections import namedtuple
import cStringIO
import string
import subprocess
import sys


# Settings.
class LexSettings:
    ''' Knobs that control how lex/yacc input is laid out.
    '''
    # Wrap loose definitions-section code in '%{' / '%}' marker lines.
    brace = True
    # Number of spaces put in front of every line of nested code
    # (4 is the commented-out alternative).
    nested_indent = 0 # 4
    # Extra columns added after the widest left-hand cell of a Table.
    pad = 2
    # The right-hand column of a Table starts at a multiple of this.
    indent = 4


# Code.
Location = namedtuple('Location', ('name', 'line', 'column', 'text'))


def _diagnostic(self, level, msg, to):
    ''' Print a compiler-style diagnostic for this location to stderr.

    Three lines are written: 'name:line:column: level: msg', the source
    text of the line, and a marker line with '^' under this column.
    If 'to' is given it must be a Location on the same line of the same
    file, at or after this one; the marker is then extended with '~' up
    to that column.
    '''
    header = '{file}:{line}:{column}: {level}: {msg}'.format(
            file=self.name, line=self.line, column=self.column,
            level=level, msg=msg)
    print(header, file=sys.stderr)
    print(self.text, file=sys.stderr)
    if not to:
        # no range requested: mark only the single column
        to = self
    else:
        assert to.name == self.name
        assert to.line == self.line
        assert to.column >= self.column
    lead = ' ' * (self.column - 1)
    tail = '~' * (to.column - self.column)
    print(lead + '^' + tail, file=sys.stderr)


def _severity(level):
    ''' Build the Location method that reports at the given level.
    '''
    def report(self, msg, to=None):
        self._diagnostic(level, msg, to)
    report.__name__ = level
    return report


# namedtuple classes are ordinary classes, so the methods are simply
# attached after the fact; the helper names are then removed again.
Location._diagnostic = _diagnostic
for _level in ('error', 'warning', 'note'):
    setattr(Location, _level, _severity(_level))
del _diagnostic, _severity, _level


class Reader(object):
    ''' Character-at-a-time reader over a line-oriented stream.

    One line is buffered at a time (always terminated by a newline, which
    is added if the last line of the stream lacks one), so that the
    current line number, column and line text are available for
    diagnostics through loc().
    '''
    __slots__ = ('_name', '_stream', '_buffer', '_line', '_column')

    def __init__(self, name, stream, line=1, column=1):
        ''' Create a new character reader that is smart with lines.

        name is the file name used in Locations, stream is any object
        with a readline() method, and line/column give the 1-based
        position to report for the first character of the stream.
        '''
        self._name = name
        self._stream = stream
        # Pretend we sit on the newline of the line before the first one,
        # so that the adv() below pulls in the real first line.
        self._buffer = '\n'
        self._line = line - 1
        self._column = 0

        column -= 1
        self.adv()
        if self._buffer:
            # Pad the first line so its characters land on the requested
            # columns.  An empty stream is left alone: padding it would
            # produce a buffer without a newline that get() could index
            # past the end of.
            self._buffer = ' ' * column + self._buffer
            self._column = column

    def get(self):
        ''' Fetch the current character, or falsy on EOF
        '''
        if self._buffer:
            return self._buffer[self._column]
        else:
            return None # less prone to accidental errors than ''

    def loc(self):
        ''' Fetch the Location of the current character.
        '''
        # internally we store 0-based, but users want 1-based
        # also, cut off the newline
        return Location(self._name, self._line, self._column + 1,
                self._buffer[:-1])

    def adv(self):
        ''' Step to the next character, reading a new line when needed.

        Does nothing once the end of the stream has been reached.
        '''
        if not self._buffer:
            # already at EOF; there is nothing to index or advance past
            return
        if self._buffer[self._column] == '\n':
            self._buffer = self._stream.readline()
            self._line += 1
            self._column = 0
            # guarantee every non-empty buffer ends with a newline
            if self._buffer and not self._buffer.endswith('\n'):
                self._buffer += '\n'
        else:
            self._column += 1

def string_reader(s, name='<string>', line=1, column=1):
    ''' Build a Reader over the in-memory string s.

    name, line and column are handed to Reader unchanged.
    '''
    stream = cStringIO.StringIO(s)
    return Reader(name, stream, line, column)

def take_while(b, r, s):
    ''' Move characters from the Reader r onto the bytearray b for as
    long as they belong to the collection s; stops at EOF or at the
    first character outside s, which is left unconsumed.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)
    allowed = frozenset(s)
    ch = r.get()
    while ch and ch in allowed:
        b += ch
        r.adv()
        ch = r.get()

def take_mlc(b, r):
    ''' Append the remainder of a multi-line comment to b.

    Characters are consumed from the Reader r up to and including the
    first '*' that is directly followed by '/'.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)

    previous = None
    closed = False
    while not closed:
        ch = r.get()
        r.adv()
        b += ch
        closed = previous == '*' and ch == '/'
        previous = ch

def take_slc(b, r):
    ''' Append the remainder of a single-line comment to b.

    Characters are consumed from the Reader r up to and including the
    first newline that does not directly follow a backslash, so the
    terminating newline becomes part of b.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)

    previous = None
    while True:
        ch = r.get()
        r.adv()
        b += ch
        if ch == '\n' and previous != '\\':
            return
        previous = ch

def take_char(b, r):
    ''' Append the remainder of a character literal to b.

    Characters are consumed from the Reader r up to and including the
    first unescaped single quote; a backslash escapes the character
    that follows it.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)

    escaped = False
    while True:
        ch = r.get()
        r.adv()
        b += ch
        if escaped:
            # this character was protected by the preceding backslash
            escaped = False
        elif ch == '\'':
            return
        else:
            escaped = ch == '\\'

def take_str(b, r):
    ''' Append the remainder of a string literal to b.

    Characters are consumed from the Reader r up to and including the
    first unescaped double quote; a backslash escapes the character
    that follows it.
    '''
    assert isinstance(b, bytearray)
    assert isinstance(r, Reader)

    escaped = False
    while True:
        ch = r.get()
        r.adv()
        b += ch
        if escaped:
            # this character was protected by the preceding backslash
            escaped = False
        elif ch == '"':
            return
        else:
            escaped = ch == '\\'


def round_up(i, a):
    ''' Return i rounded up to the next multiple of a (i itself if it
    already is a multiple).
    '''
    remainder = i % a
    if not remainder:
        return i
    return i + (a - remainder)

class Table:
    ''' Buffered two-column output.

    Rows are collected with put1() (a plain line) and put2() (a left and
    a right cell); flush() writes them to stdout with all right cells
    starting in the same column.
    '''
    def __init__(self):
        # list of (left, right) pairs; right is '' for plain lines
        self.buf = []
        # length of the longest left cell that has a non-empty right cell
        self.size = 0

    def put1(self, line):
        ''' Queue a single-column line (trailing whitespace removed).
        '''
        self.buf.append((line.rstrip(), ''))

    def put2(self, left, right):
        ''' Queue a two-column row; both cells are stripped of
        surrounding whitespace on the sides that face the gap/line end.
        '''
        row = (left.rstrip(), right.strip())
        self.buf.append(row)
        if row[1]:
            self.size = max(self.size, len(row[0]))

    def flush(self):
        ''' Write all queued rows to stdout and reset the table.
        '''
        # right column: widest left cell plus padding, on an indent stop
        self.size = round_up(self.size + LexSettings.pad, LexSettings.indent)
        for left, right in self.buf:
            if right:
                gap = ' ' * (self.size - len(left))
                sys.stdout.writelines([left, gap, right, '\n'])
            else:
                sys.stdout.writelines([left, '\n'])
        del self.buf[:]
        self.size = 0

def format_lex_or_yacc_definitions():
    ''' Reformat the definitions section (mostly used for options actually).

    Lines are read from sys.stdin until a line that is exactly '%%'.
    Code found in %{ %} blocks, on indented lines, and in braced
    %top / %code / %union blocks is run through indent_cpp_slop();
    'name expansion' lines are aligned in two columns.  The result is
    written to sys.stdout, followed by an empty line and '%%'.
    '''
    table = Table()
    in_code = False
    # code collected so far that has not been emitted yet
    code = bytearray()
    for line in sys.stdin:
        if line == '%%\n':
            break
        if line == '%{\n':
            # start of a literal code block; the marker itself is dropped
            in_code = True
            continue
        if in_code:
            if line == '%}\n':
                in_code = False
                continue
            code += line
            continue
        if not line.strip() or line != line.lstrip():
            # starts with whitespace or is an empty line ('\n')
            code += line
            continue
        # An unindented, non-empty line ends any pending run of code:
        # emit that run before looking at the line itself.
        if code.strip():
            if LexSettings.brace:
                table.put1('%{')
            for line2 in indent_cpp_slop(code):
                table.put1(LexSettings.nested_indent * ' ' + line2)
            if LexSettings.brace:
                table.put1('%}')
                table.put1('')
        elif code:
            # only blank lines were pending: collapse them into one
            table.put1('')
        code = bytearray()

        if line.startswith('%'):
            # %top is flex, %code and %union are bison
            union = line.startswith('%union')
            if union or line.startswith('%top') or line.startswith('%code'):
                # TODO fix stupidity when in strings or comments
                # count tracks the current brace nesting depth
                count = line.count('{')
                #code = bytearray()
                if union:
                    assert count <= 1
                    # drop the '%' (it is put back on the first output
                    # line below) -- presumably so that the C++ indenter
                    # sees an ordinary 'union'
                    code += line[1:]
                else:
                    if count:
                        assert count == 1
                        # directive goes out as-is, the brace and what
                        # follows it belongs to the code
                        code += line[line.find('{'):]
                        table.put1(line[:line.find('{')])
                    else:
                        table.put1(line.rstrip())
                assert line.count('}') == 0
                # keep consuming lines until the brace count is back at 0
                for line in sys.stdin:
                    count += line.count('{') - line.count('}')
                    code += line
                    assert count >= 0
                    if count == 0:
                        break
                if union:
                    first = True
                    for line2 in indent_cpp_slop(code):
                        if first:
                            line2 = '%' + line2
                            first = False
                        table.put1(line2)
                else:
                    for line2 in indent_cpp_slop(code):
                        table.put1(LexSettings.nested_indent * ' ' + line2)
                code = bytearray()
            else:
                # any other directive is passed through unchanged
                table.put1(line)
        elif line[0].isalpha() or line[0] == '_':
            # name definition: name in the left column, expansion right
            # NOTE(review): a line holding only a name splits into one
            # piece, so put2() would be called with a missing argument
            table.put2(*line.split(None, 1))
        else:
            table.put1(line)

    assert not in_code
    del code
    del in_code
    table.flush()
    sys.stdout.write('\n%%\n')

def format_lex_rules():
    ''' Reformat the rule section of a lex file.

    Lines are read from sys.stdin until a line that is exactly '%%'.
    Each rule is split into its pattern and its action; the action is
    run through indent_cpp_slop() and the two are aligned in columns.
    The result is written to sys.stdout, followed by '%%'.

    Raises NotImplementedError for lines starting with a <start
    condition> (lines starting with '<<' are let through).
    '''
    table = Table()
    for line in sys.stdin:
        if line == '%%\n':
            break
        if line.startswith('<') and not line.startswith('<<'):
            raise NotImplementedError('start conditions not yet supported')
        # Scan for the end of the pattern: the first whitespace that is
        # not escaped, quoted, inside [...] or inside parentheses.
        i = 0   # index of the character being examined
        p = 0   # current parenthesis nesting depth
        bs = False  # previous character was an unconsumed backslash
        while True:
            if bs:
                # character after a backslash: skip it unexamined
                bs = False
                i += 1
                continue
            if line[i] == '\\':
                bs = True
                i += 1
                continue
            if not p and line[i].isspace():
                break
            if line[i] == '"':
                # quoted string: jump to the closing quote
                i += 1
                while line[i] != '"':
                    if line[i] == '\\':
                        i += 1
                    i += 1
            elif line[i] == '[':
                # character class: jump to the closing bracket
                i += 1
                if line[i] == '^':
                    i += 1
                while line[i] != ']':
                    i += 1
            elif line[i] == '(':
                p += 1
            elif line[i] == ')':
                assert p
                p -= 1
            i += 1
        if not i:
            # the line starts with whitespace, i.e. has no pattern
            # NOTE(review): the rest of such a line is not kept; only an
            # empty line is emitted for it
            table.put1('')
            continue
        del bs
        del p
        pattern = line[:i]
        rule = bytearray(line[i:])
        del i
        # pull in continuation lines until the action's braces balance
        count = rule.count('{') - rule.count('}')
        while count:
            blah = next(sys.stdin)
            rule += blah
            count += blah.count('{') - blah.count('}')
        rules = indent_cpp_slop(rule)
        # first action line shares the row with the pattern, the rest
        # follow as plain lines
        table.put2(pattern, rules[0])
        for line in rules[1:]:
            table.put1(line)

    table.flush()
    sys.stdout.write('%%\n')

def _yacc_literal_end(line, quote, what):
    ''' Return the index just past the quoted literal that starts line.

    line[0] is the opening quote; a backslash escapes the character that
    follows it.  Raises Exception('broken <what>') if the literal is not
    closed on this line.
    '''
    escaped = False
    for i, c in enumerate(line):
        if escaped:
            # this character is protected by the backslash before it;
            # the flag must be cleared or nothing after it would count
            escaped = False
            continue
        escaped = c == '\\'
        if i and c == quote:
            return i + 1
    raise Exception('broken ' + what)


def format_yacc_rules():
    '''
        Reformat the rules section of a yacc file from stdin to stdout,
        stopping at a line that is exactly '%%' (which is echoed).

        tokens are any of:
        word
        word[namedref]
        'c'
        "str"
        { code }
        break before {
        break twice before a : or |
        break twice before and thrice after a ;
        put a softspace after everything except ;

        Code in braces is passed through indent_cpp_slop().  Raises
        Exception if a 'c' or "str" token is not closed on its line.
    '''
    sys.stdout.write('\n')
    softspace = '' # NOT reset by new lines
    for line in sys.stdin:
        if line == '%%\n':
            break
        line = line.strip()
        while line:
            if line.startswith("'"):
                i = _yacc_literal_end(line, "'", 'char')
                word = line[:i]
                line = line[i:].lstrip()
                sys.stdout.writelines([softspace, word])
                softspace = ' '
                continue
            if line.startswith('"'):
                i = _yacc_literal_end(line, '"', 'string')
                word = line[:i]
                line = line[i:].lstrip()
                sys.stdout.writelines([softspace, word])
                softspace = ' '
                continue
            if line.startswith(':'):
                line = line[1:].lstrip()
                sys.stdout.write('\n\n:')
                softspace = ' '
                continue
            if line.startswith('{'):
                line += '\n'
                lines = bytearray()
                # TODO fix braces in comments and strings
                lo = 1      # everything before this index has been counted
                behold = 1  # number of braces that are still open
                while behold:
                    i = line.find('}', lo)
                    if i == -1:
                        # no closer left on this line: count the openers
                        # and continue on the next input line
                        behold += line[lo:].count('{')
                        lines += line
                        line = next(sys.stdin)
                        lo = 0
                    else:
                        behold -= 1
                        i += 1
                        behold += line[lo:i].count('{')
                        lo = i
                lines += line[:lo]
                for line2 in indent_cpp_slop(lines):
                    sys.stdout.writelines(['\n', line2])
                # whatever follows the closing brace is tokenized as usual
                line = line[lo:].strip()
                softspace = ' '
                continue
            if line.startswith(';'):
                line = line[1:].lstrip()
                sys.stdout.write('\n\n;\n\n\n')
                softspace = ''
                continue
            if line.startswith('|'):
                line = line[1:].lstrip()
                sys.stdout.write('\n\n|')
                softspace = ' '
                continue
            # screw comments
            word, _, line = line.partition(' ')
            line = line.lstrip()
            if word.endswith(':'):
                # 'name:' -> emit 'name' now and push the ':' back so the
                # next round handles it as a token of its own
                word = word[:-1]
                line = ':' + line
            sys.stdout.writelines([softspace, word])
            softspace = ' '
            continue
        # while line
    # for line in stdin
    sys.stdout.write('%%\n')

def format_lex():
    '''
    Reformat a lex source file, section by section.

    A lex file is made of three sections separated by %% lines.

    Section 1 (definitions) may contain:
        indented code (any line that begins with whitespace)
        a /* comment */, which lasts until the closing */
        a #line directive
        a directive: %s, %x, %pointer, %array, %option, %[a-z][0-9].*
        a %{ codeblock %}, which is ended by %}
        a %top { codeblock }, which is ended by the } matching the nesting
        a name followed by its expansion
        a %% line, which switches to section 2

    Section 2 opens with a header that may contain:
        %{ %} sections, possibly nested
        indented code
        unindented code, provided it is inside the %{ %}

    Section 2 proper (rules) contains:
        pattern         action
        <sc>pattern     action
        <sc>{
            pattern     action
        }
        a %% line, which switches to section 3

    Section 3 is just C code.

    The work is delegated, in order, to the definitions formatter, the
    lex rules formatter and the C++ formatter.
    '''
    stages = (
        format_lex_or_yacc_definitions,
        format_lex_rules,
        format_cc,
    )
    for stage in stages:
        stage()

def format_yacc():
    '''
    Reformat a yacc source file, section by section.

    A yacc file is made of three sections separated by %% lines.

    Section 1: whitespace and comments are ignored; the tokens are
        %someoption
        =
        |
        ;
        name
        name:
        int

        'char'
        "string"
        <*>
        <>
        <something>
        %{ prologue %}
        { braced code }
        [ bracketed identifier ]
        %% (switches to section 2)

    Section 2: formally accepts the very same tokens as section 1,
    but in practice it looks like

        name:
            symbol 'c' "str" { code }
        |
            symbol 'c' "str" { code } /* in any order */
        ;

    where any name may instead be written name[namedref], and code may
    additionally contain $$, $1, $namedref.

    Section 3 is just C code.

    The work is delegated, in order, to the definitions formatter, the
    yacc rules formatter and the C++ formatter.
    '''
    stages = (
        format_lex_or_yacc_definitions,
        format_yacc_rules,
        format_cc,
    )
    for stage in stages:
        stage()

def format_cc():
    '''
    Pipe whatever is left on stdin through the external `indenter-cpp`
    program and exit with that program's status.

    This function never returns: it always ends in sys.exit().
    '''
    # Push out our own pending output first; the child is started with
    # stdout=None, i.e. it writes to the same stdout we do.
    sys.stdout.flush()
    child = subprocess.Popen(
        ['indenter-cpp'],
        stdin=subprocess.PIPE,
        stdout=None,
    )
    feed = child.stdin
    feed.writelines(sys.stdin)
    # Closing the pipe is what tells the child its input is finished.
    feed.close()
    status = child.wait()
    sys.exit(status)

def indent_cpp_slop(code):
    '''
    Run a fragment of code through the external `indenter-cpp` program.

    code is sent to the child's stdin; the child's stdout is stripped of
    surrounding whitespace and returned as a list of lines (split on
    newline characters). The child's exit status is not inspected.
    '''
    child = subprocess.Popen(
        ['indenter-cpp'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # stderr is not piped, so the second element is always None.
    formatted, _ = child.communicate(code)
    return formatted.strip().split('\n')

# Every punctuation token the C++ lexer knows about.  This set is only a
# staging area: it is replaced by a suffix table immediately below.
operators = {
    '#', '##',
    '+', '++', '+=',
    '-', '--', '-=', '->', '->*',
    '*', '*=',
    '/', '/=',
    '%', '%=',
    '=', '==',
    '!', '!=',
    '~',
    '|', '||', '|=',
    '&', '&&', '&=',
    '^', '^=',
    '<', '<=', '<<', '<<=',
    '>', '>=', '>>', '>>=',
    '.', '..', '.*', '...',
    ':', '::',
    '(', ')',
    '[', ']',
    '{', '}',
    '?',
    ',', ';',

    '//', '/*', # comments are specially handled at end
}
# Turn the flat set into a table: operator -> set of the suffixes that,
# appended to it, give another (longer) known operator.  For example
# operators['<<'] == {'='} and operators['~'] == set().  The lexer grows
# a token one character at a time by checking the next character against
# this set.
# Strings are compared by value (!=) instead of by identity: identity
# only happened to work because both loops walk the very same set.
operators = {
    k: {v[len(k):] for v in operators if v != k and v.startswith(k)}
    for k in operators
}

# Character classes consulted by CxxLexer.pull().
# num1 / ident1: characters allowed to start a number / an identifier.
# num_x / ident_x: characters allowed to continue one.
num1 = string.digits
num_x = string.digits + '.' + '\''
ident1 = string.ascii_letters + '_$@' # $@ for bison
ident_x = string.ascii_letters + '_$@' + string.digits

class CxxLexer(object):
    '''
    Tokenizer with one token of lookahead on top of a character reader.

    The reader must provide get() (current character, falsy at end of
    input), adv() (step to the next character) and loc() (an object
    with an error(msg) method, used for diagnostics).

    The current token is a triple (whitespace, token, f):
        whitespace  the blanks that preceded the token
        token       the token text, or None at end of input
        f           a third slot that pull() always leaves as None
    '''
    __slots__ = ('_reader', '_w', '_t', '_f', '_namespaces', '_classes')

    def __init__(self, reader):
        ''' Wrap reader and load the first token. '''
        self._reader = reader
        self.adv()
        # Not touched by any method of this class as written.
        self._namespaces = []
        self._classes = []

    def get(self):
        ''' Return the current (whitespace, token, f) without consuming it. '''
        return self._w, self._t, self._f

    def adv(self):
        ''' Discard the current token and load the next one. '''
        self._w, self._t, self._f = self.pull()

    def pull(self):
        '''
        Read one token from the reader.

        Returns (whitespace, token, None), both as bytearrays.  At end
        of input returns ('\\n', None, None); any whitespace collected
        just before the end is dropped.

        Exits the process with status 1 after reporting an error if the
        token starts with a character that no branch recognizes.
        '''
        r = self._reader

        # Leading whitespace.
        white = bytearray()
        while True:
            c = r.get()
            if not c:
                return '\n', None, None
            if not c.isspace():
                break
            white += c
            r.adv()

        # c is the first character of the token; remember where it
        # started in case we have to complain about it.
        black = bytearray()
        black += c
        start = r.loc()
        r.adv()

        if c in operators:
            # Maximal munch: keep extending while the next character is
            # a known suffix of what has been collected so far.
            while True:
                c = r.get()
                if not c or c.isspace():
                    break
                op = operators[str(black)]
                if c not in op:
                    break
                black += c
                r.adv()
            if black == '/*':
                take_mlc(black, r)
            if black == '//':
                take_slc(black, r)
        elif c in num1:
            take_while(black, r, num_x)
            c = r.get()
            # A trailing identifier (e.g. a literal suffix) is glued to
            # the number.  The truthiness test matters: at end of input
            # an empty string would otherwise count as being "in" ident1.
            if c and c in ident1:
                black += c
                r.adv()
                take_while(black, r, ident_x)
        elif c in ident1:
            take_while(black, r, ident_x)
            c = r.get()
            # Encoding prefix directly followed by a string literal.
            if black in ('L', 'u8', 'u', 'U') and c == '"':
                black += c
                r.adv()
                take_str(black, r)
        elif c == '\'':
            take_char(black, r)
        elif c == '"':
            take_str(black, r)
        else:
            start.error('Unknown character: %r' % c)
            sys.exit(1)

        return white, black, None

def whitespace(w, cur, prev):
    '''
    Decide which whitespace to emit in front of a token.

    w     the whitespace found in the input before the token
    cur   pair (t, f) describing the token about to be written
    prev  pair (pt, pf) describing the token written before it

    Currently the input whitespace is returned unchanged.
    '''
    # Tuple parameters in the signature are a syntax error on Python 3
    # (PEP 3113), so the pairs are unpacked here instead.  Unpacking is
    # kept so that a malformed pair still fails loudly.
    t, f = cur
    pt, pf = prev
    return w

def format_ii():
    '''
    Re-emit stdin token by token through CxxLexer.

    Each token is written to stdout preceded by whatever whitespace()
    returns for it.  A final newline is added when the last token
    written does not already end with one.  Empty or whitespace-only
    input produces no output.
    '''
    lexer = CxxLexer(Reader('<stdin>', sys.stdin))
    pt = None
    pf = None
    while True:
        w, t, f = lexer.get()
        if not t:
            # No token: end of input.
            break
        lexer.adv()

        w = whitespace(w, (t, f), (pt, pf))
        sys.stdout.writelines([w, t])
        pt, pf = t, f
    # pt is still None when the input held no token at all; calling
    # endswith() on it would raise AttributeError.
    if pt is not None and not pt.endswith('\n'):
        sys.stdout.write('\n')

# Dispatch table: command-line switch -> formatter for that file type.
exts = dict([
    ('-cpp', format_cc),
    ('-ipp', format_ii),
    ('-lpp', format_lex),
    ('-ypp', format_yacc),
])

# Script entry point: expects exactly one argument, one of the keys of
# exts, and runs the matching formatter on stdin/stdout.
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 2:
        # The message used to be passed with its %s left unformatted,
        # which printed a literal "%s" instead of the program name.
        prog = sys.argv[0] if sys.argv else 'indenter'
        sys.exit('Usage: %s -ext < input.ext > output.ext' % prog)
    func = exts.get(sys.argv[1])
    if not func:
        sys.exit('Bad -ext')
    func()