From ed81f91ac3c470e00df0d5c5cf9c274ce30f47b3 Mon Sep 17 00:00:00 2001
From: Ben Longbons <b.r.longbons@gmail.com>
Date: Fri, 16 Sep 2011 14:36:23 -0700
Subject: Reimplement aligncsv in C++ instead of python.

The main advantage of the new one is that it does not split fields at the
commas inside script blocks such as {script arg1, arg2;}
---
 tools/.gitignore   |   1 +
 tools/aligncsv.cpp | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 tools/aligncsv.py  |  88 --------------------------
 3 files changed, 178 insertions(+), 88 deletions(-)
 create mode 100644 tools/.gitignore
 create mode 100644 tools/aligncsv.cpp
 delete mode 100755 tools/aligncsv.py

(limited to 'tools')
diff --git a/tools/.gitignore b/tools/.gitignore
new file mode 100644
index 00000000..00e2a6af
--- /dev/null
+++ b/tools/.gitignore
@@ -0,0 +1 @@
+/aligncsv
diff --git a/tools/aligncsv.cpp b/tools/aligncsv.cpp
new file mode 100644
index 00000000..e65c2605
--- /dev/null
+++ b/tools/aligncsv.cpp
@@ -0,0 +1,177 @@
+#include <unistd.h> // ftruncate
+#include <cerrno>
+#include <cstddef>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// the widest entry of a column gets min_pad..min_pad+align_pad-1 (3-6) spaces
+// (headers excluded); columns start at multiples of align_pad to ease indenting
+const size_t min_pad = 3;
+const size_t align_pad = 4;
+
+// Fold one parsed line into the per-column widths in `sizes`, trimming the
+// single trailing space a field may still carry. Comment lines (leading '#'
+// or "//") are left untouched and do not influence the widths.
+// A trailing empty field is kept on purpose: dropping it would remove the
+// trailing comma that certain db.txt files depend on.
+void add_pieces(std::vector<std::string>& line, std::vector<size_t>& sizes)
+{
+    const size_t num_sizes = line.size();
+    if (!num_sizes) // blank line
+        return;
+    // a bare "#" is a comment too: only the "//" test needs two characters
+    const std::string& head = line[0];
+    if (!head.empty()
+        && (head[0] == '#'
+            || (head.size() >= 2 && head[0] == '/' && head[1] == '/')))
+        return;
+
+    if (num_sizes > sizes.size())
+        sizes.resize(num_sizes, 1UL);
+    for (size_t i = 0; i < num_sizes; ++i)
+    {
+        size_t elt_size = line[i].size();
+        if (!elt_size) // empty field: leaves the column width alone
+            continue;
+        if (line[i][elt_size - 1] == ' ')
+            line[i].resize(--elt_size);
+        // mandatory padding and comma
+        elt_size += min_pad + 1;
+        if (elt_size > sizes[i])
+            sizes[i] = elt_size;
+    }
+}
+
+// Re-align the comma-separated fields of `in` into padded columns and write
+// the result to `out`; `name` only labels the perror() diagnostics.
+// Commas between '{' and '}' do not separate fields. Comment lines (as
+// recognised by add_pieces) are padded too but never widen a column.
+// The arguments may be the same file - the whole file is stored in memory.
+void aligncsv(FILE *in, FILE *out, const char *name)
+{
+    bool newline = true;
+    // commas inside {...} belong to a script and must not split the field
+    bool can_split = true;
+    // false at the start of a field and right after a space has been kept
+    bool can_have_whitespace = false;
+    int c;
+    std::vector<std::vector<std::string> > contents;
+
+    while ((c = fgetc(in)) != EOF)
+    {
+        if (c == '}' || c == '\n')
+            can_split = true;
+        if (c == '\n')
+        {
+            if (newline)
+            {
+                // preserve consecutive blank lines
+                contents.push_back(std::vector<std::string>());
+            }
+            newline = true;
+            // the indentation of the next line has to be dropped as well
+            can_have_whitespace = false;
+            continue;
+        }
+        if (c == '{')
+            can_split = false;
+        if (c == '\t')
+            c = ' ';
+        if (c == ' ')
+        {
+            if (!can_have_whitespace)
+                continue;
+            can_have_whitespace = false;
+        }
+        else
+            can_have_whitespace = true;
+        if (newline)
+        {
+            // start from an empty field so that a leading comma still splits
+            contents.push_back(std::vector<std::string>(1));
+            newline = false;
+        }
+        if (can_split && c == ',')
+        {
+            can_have_whitespace = false;
+            contents.back().push_back(std::string());
+        }
+        else
+            contents.back().back() += static_cast<char>(c);
+    }
+
+    typedef std::vector<std::vector<std::string> >::iterator outer_it;
+    typedef std::vector<std::vector<std::string> >::const_iterator outer_cit;
+    typedef std::vector<size_t>::iterator pieces_it;
+    // at this point, each entry in a line:
+    // * does not start with whitespace
+    // * has one space in place of any previous run of whitespace
+    // * may end in a single space
+    // The last is fixed during add_pieces
+    std::vector<size_t> pieces;
+    for (outer_it it = contents.begin(), end = contents.end(); it != end; ++it)
+        add_pieces(*it, pieces);
+    // round every column width up to a multiple of align_pad
+    for (pieces_it it = pieces.begin(), end = pieces.end(); it != end; ++it)
+        if (size_t trail = *it % align_pad)
+            *it += align_pad - trail;
+
+    if (in == out)
+    {
+        // rewriting in place: go back to the start and drop the old contents
+        if (fseek(out, 0, SEEK_SET) != 0)
+        {
+            perror(name);
+            return;
+        }
+        if (ftruncate(fileno(out), 0) == -1)
+        {
+            perror(name);
+            return;
+        }
+    }
+    // write every line back, padding each field to the width of its column
+    for (outer_cit oit = contents.begin(), oend = contents.end(); oit != oend; ++oit)
+    {
+        const std::vector<std::string>& inner = *oit;
+        const size_t num_elems = inner.size();
+        bool ok = true;
+        for (size_t i = 0; ok && i < num_elems; ++i)
+        {
+            // FIXME handle UTF-8 characters (here AND above?)
+            ok = fputs(inner[i].c_str(), out) != EOF;
+            if (ok && i != num_elems - 1)
+            {
+                ok = fputc(',', out) != EOF;
+                // comment lines are not measured by add_pieces and may have
+                // more fields than pieces has entries: those get no padding
+                const size_t width = i < pieces.size() ? pieces[i] : 0;
+                for (size_t len = inner[i].size() + 1; ok && len < width; ++len)
+                    ok = fputc(' ', out) != EOF;
+            }
+        }
+        if (!ok || fputc('\n', out) == EOF)
+        {
+            perror(name);
+            return;
+        }
+    }
+}
+
+// Without arguments, filter stdin to stdout; otherwise rewrite every named
+// file in place. Files that cannot be opened or saved are reported and skipped.
+int main(int argc, char **argv)
+{
+    if (argc == 1)
+        aligncsv(stdin, stdout, "<stdio>");
+    for (int i = 1; i < argc; ++i)
+    {
+        FILE *f = fopen(argv[i], "r+");
+        if (f)
+            aligncsv(f, f, argv[i]);
+        // fclose flushes the rewritten data, so its failure must be reported
+        if (!f || fclose(f) == EOF)
+            perror(argv[i]);
+    }
+}
diff --git a/tools/aligncsv.py b/tools/aligncsv.py
deleted file mode 100755
index 54bcd9d4..00000000
--- a/tools/aligncsv.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/python
-# this formats a csv file to a serious whitespace intended format.
-
-import os
-import sys
-
-tabs=True
-additionalspaces = 5
-
-fname = sys.argv[1]
-if not os.path.exists(fname):
-	print "that file doesn't exist"
-	exit(0);
-
-f=open(fname,"r");
-lines=f.readlines()
-f.close();
-length=0
-
-for line in lines:
-	length=max(length, len(line.split(",")))
-
-print "# number of entries =",length
-
-
-#setup text array
-textarray=range(len(lines)+1)
-for x in range(len(lines)+1):
-	textarray[x] = range(length)
-
-for x in range(length):
-	textarray[-1][x] = 0
-
-#find the longest entry in each line in each position
-for lineno in range(len(lines)):
-	if not lines[lineno].strip().startswith("//") or lines[lineno].strip().startswith("//id"):
-		sp=lines[lineno].split(",")
-		for pieceno in range(len(sp)):
-			sp[pieceno] = sp[pieceno].strip() + "," #for the comma add a char
-			textarray[-1][pieceno] = max(len(sp[pieceno]),textarray[-1][pieceno])
-
-if tabs:
-	#make it divisable by 8 (tabs work then)
-	for pieceno in range(length):
-		if (textarray[-1][pieceno] %8) !=0:
-			textarray[-1][pieceno] = (((textarray[-1][pieceno])/8)*8)+8
-
-for lineno in range(len(lines)):
-	if not lines[lineno].strip().startswith("//") or lines[lineno].strip().startswith("//id"):
-		sp=lines[lineno].split(",")
-		for pieceno in range(length):
-			textarray[lineno][pieceno] = ""
-			if pieceno<len(sp):
-				sp[pieceno]= sp[pieceno].strip()
-				if pieceno<len(sp)-1:
-					sp[pieceno]= sp[pieceno] + ","
-
-				if (tabs):
-					n=(textarray[-1][pieceno]-len(sp[pieceno]))
-					textarray[lineno][pieceno] = sp[pieceno]
-					if (n%8) != 0:
-						textarray[lineno][pieceno] += "\t"*((n/8)+1)
-					else:
-						textarray[lineno][pieceno] += "\t"*((n/8))
-				else:
-					n=(textarray[-1][pieceno]-len(sp[pieceno])+additionalspaces)
-					textarray[lineno][pieceno] = " "*(n) + sp[pieceno]
-	else:
-		for pieceno in range(length):
-			textarray[lineno][pieceno] = ""
-		textarray[lineno][0]=lines[lineno].strip()
-
-
-fname = sys.argv[2]
-if not os.path.exists(fname):
-	print "that file doesn't exist"
-	exit(0);
-else:
-	f=open(fname,"w");
-	for line in textarray[:-1]:
-		for piece in line:
-			f.write(piece)
-		f.write("\n")
-	f.close()
-
-
-
-
-- 
cgit v1.2.3-70-g09d2