Sat, 05 Jul 2008

#!/usr/bin/python
import re, sys, StringIO
"""Enumerate sentences of a context-free grammar.

Some sample, and kind of creepy, output:

Python writes this.
Python writes Python.
Python writes code.
Python writes you.
Python writes me.
...
the Python is code.
the Python is you.
the Python is me.
the code writes this.
...
6 lines are Python.
6 lines are code.
6 lines are you.
6 lines are me.
the generators are this.
the generators are Python.
...
by contrast, this is code.
by contrast, this is you.
by contrast, this is me.
the Python writes 6 generators.
the Python writes 6 lines.
the Python writes the generators.
the Python writes the lines.
...
by contrast, the lines are Python.
by contrast, the lines are code.
by contrast, the lines are you.
by contrast, the lines are me.
...
6 generators let you think me.
6 generators let me write this.
6 generators let me write Python.
...
the lines let Python think this.
the lines let Python think Python.
...
by contrast, the code writes the generators.
by contrast, the code writes the lines.
by contrast, the code is 6 generators.
by contrast, the code is 6 lines.
...
Python writes the Python code generators.
Python writes the Python code lines.
...
code writes 6 lines of code.
code writes 6 lines of you.
code writes 6 lines of me.
code writes the generators of this.
code writes the generators of Python.
code writes the generators of code.
code writes the generators of you.
code writes the generators of me.
"""

default_cfg = '''
sentence "."

sentence:
        singular-subject " " singular-predicate
        plural-subject " " plural-predicate
        sentence-modifier ", " sentence
sentence-modifier:
        "by contrast"
plural-subject:
        plural-noun
singular-subject:
        mass-noun
        singular-subject-pronoun
        "the " mass-noun
singular-subject-pronoun:
        "that"
        "this"
mass-noun:
        "Python"
        "code"
plural-noun:
        number " " simple-plural-noun
        plural-article " " simple-plural-noun
        plural-noun " " prepositional-phrase
plural-article:
        "the"
singular-article:
        "a"
        plural-article
prepositional-phrase:
        preposition " " object
preposition:
        "of"
simple-plural-noun:
        mass-noun " " simple-plural-noun
        "generators"
        "lines"
number:
        "6"
plural-predicate:
        plural-transitive-verb-phrase " " object
plural-transitive-verb-phrase:
        "let " object " " infinitive
        "are"
infinitive:
        "write"
        "think"
object:
        "this"
        plural-noun
        mass-noun
        "you"
        "me"
singular-predicate:
        singular-transitive-verb-phrase " " object
singular-transitive-verb-phrase:
        "writes"
        "is"
'''


class Literal:
    "A literal string in the grammar."
    def __init__(self, contents):
        self.contents = contents.replace(r"\n", "\n")
    def alternatives(self, nwords, grammar):
        if nwords == 1:
            yield self.contents

class Nonterminal:
    "A nonterminal in the grammar; expands into some phrase."
    def __init__(self, name):
        self.name = name
    def alternatives(self, nwords, grammar):
        for production in grammar[self.name]:
            for alternative in production.alternatives(nwords, grammar):
                yield alternative

class Concatenation:
    "A concatenation of two literals, nonterminals, and concatenations."
    def __init__(self, left, right):
        self.left, self.right = left, right
    def alternatives(self, nwords, grammar):
        for ii in range(1, nwords):
            for left_alt in self.left.alternatives(ii, grammar):
                for right_alt in self.right.alternatives(nwords - ii,
                                                         grammar):
                    yield left_alt + right_alt

def concatenation(items):
    "Produce a concatenation of an arbitrary nonzero number of terms."
    assert len(items) > 0
    if len(items) == 1: return items[0]
    elif len(items) == 2: return Concatenation(items[0], items[1])
    else: return Concatenation(items[0], concatenation(items[1:]))

def parse_production(line):
    "Turn a production line into a Concatenation, Nonterminal, or Literal"
    tokens = re.findall(r'\s*([-\w]+|"(?:[^"\\]|\\.)+")', line)
    rv = []
    for tok in tokens:
        if tok.startswith('"'):
            rv.append(Literal(tok[1:-1]))
        else:
            rv.append(Nonterminal(tok))
    return concatenation(rv)

class CouldntParse(Exception): pass

class Grammar:
    "A container for the productions and the first rule."
    def __init__(self, productions, first_rule):
        self.first_rule = first_rule
        self.productions = productions
    def __getitem__(self, x):
        return self.productions[x]
    def alternatives(self, nwords):
        return self.first_rule.alternatives(nwords, self)

def parse_grammar(infile):
    current_rule = None
    first_rule = None
    productions = {}
    for line in infile:
        if line.startswith(" "):
            productions[current_rule].append(parse_production(line))
        elif line.endswith(":\n"):
            current_rule = line[:-2]
            productions[current_rule] = []
        elif re.match(r"\s*$", line) or re.match(r'#', line):
            pass
        elif first_rule is None:
            first_rule = parse_production(line)
        else:
            raise CouldntParse(line)
    return Grammar(productions, first_rule)

if __name__ == '__main__':
    if len(sys.argv) == 1:
        infile = StringIO.StringIO(default_cfg)
    else:
        infile = file(sys.argv[1])
    grammar = parse_grammar(infile)
    ii = 0
    while True:
        for sentence in grammar.alternatives(ii):
            print sentence
        ii += 1


Sat, 28 Jun 2008

This is the kind of thing I wish everybody could write for themselves.
It automated a simple, repetitive task that Beatrice was spending a lot
of time on.  I wrote most of it in a few minutes, and then she finished
it.

#!/usr/bin/perl -w
use strict;
# script to help beatrice with her web pages
# comments for perl newbies

# ARGLEBARGLE marks the end of $stylelinks
# "my" creates a new variable
my $stylelinks = <<ARGLEBARGLE;
     <link href="../css/global.css" rel="stylesheet" type="text/css">
     <link rel="shortcut icon" href="../../paisley.ico">
ARGLEBARGLE

# similarly with SIMILARLY
my $sidebar = <<SIMILARLY;
<ul class="first"> 
	<li><a href="../animals/index.html"><h2 localizable="true">Animals</h2></a></li>
	<li><a href="../architecture/index.html"><h2 localizable="true">Architecture</h2></a></li>
	<li><a href="../light/index.html"><h2 localizable="true">Light</h2></a></li>
	<li><a href="../macros/index.html"><h2 localizable="true">Macros</h2></a></li>
	<li><a href="../performances/index.html"><h2 localizable="true">Performances</h2></a></li>
	<li><a href="../plants/index.html"><h2 localizable="true">Plants</h2></a></li>
	<li><a href="../../index.html"><h2 localizable="true">Home</h2></a></li>
SIMILARLY

sub mogrify_file {
  my ($input_filename, $output_file) = @_;
  # creates a variable $file, opens the file for input (<), and sticks
  # the open file in $file
  open my $file, "<", $input_filename or die "Can't open $input_filename: $!";

  # <$file> reads a line from the open file and sticks it in $_
  while (<$file>) {
    if (/<h2/) {              # /foo/ looks for "foo" in $_, returns true if found
      print $output_file $sidebar;
    } elsif (/rel="stylesheet"/) {
      print $output_file $stylelinks;
    } elsif (/CC-Attribution/) {
      print $output_file ' <p localizable="true"><a href="http://creativecommons.org/licenses/by-sa/3.0/">CC-Attribution 3.0</a> Beatrice Murch</p>';
    } else {
      print $output_file $_;
    }
  }
}

sub dirname {
  my ($filename) = @_;
  # chop off slashes followed by zero or more (*) nonslashes ([^/]) at
  # the end ($)
  $filename =~ s|/[^/]*$||;
  return $filename;
}

die unless (dirname "fixed/foo/bar/baz.html") eq "fixed/foo/bar";

sub ensure_dir_exists {
  my ($filename) = @_;
  my $dirname = dirname($filename);
  # invoke shell command mkdir -p fixed/foo/bar
  system "mkdir", "-p", $dirname;
}

sub create_new_file_from {
  my ($filename) = @_;
  my $newfilename = "fixed/$filename";
  ensure_dir_exists($newfilename);
  open my $output, ">", $newfilename or die "Can't open $newfilename: $!";
  mogrify_file($filename, $output);
}

die "no files" unless @ARGV;
for my $file (@ARGV) {
  create_new_file_from($file);
}

Sat, 21 Jun 2008

I wanted to experiment with having one mail message per file, but I'm
sad to report that, on my kernel, ext3fs becomes unusably slow with
random access to hundreds of thousands of files, even with
`dir_index`, even if the branching factor is only 100 files per
directory.  (I'm using Linux 2.6.18-6-686 from Debian Etch with
dm-crypt filesystem encryption on a year-1999 700MHz PIII laptop.
Maybe it wouldn't be a problem with a real computer, or a more recent
kernel.)

An earlier version of this code was responsible for the directory with
800 000 files that I mentioned in a previous kragen-hacks post which
included a shell script to remove that directory.  (I had given up on
`rm -rf` after 8 hours.)  This version doesn't stress the filesystem
quite as badly.

What I had in mind was to try checking my mailbox into Git, to get
better compression, faster syncing, and automatic detection and
correction of data corruption.  The 1.4 version of Git I have doesn't
seem to perform reasonably on the task, but maybe that's because the
underlying filesystem is sucking rocks.

I probably ought to try running this on murdererfs and see if it
performs better; after all, this is the kind of thing it's made for,
right?

Like everything else posted to kragen-hacks without any notice to the
contrary, this program is in the public domain; I abandon any
copyright in it.

#!/usr/bin/python
import sys, os

def makedirs(name):
    try: os.makedirs(name)
    except OSError, e:
        if e.errno == 17: pass
        else: raise

class Output:
    def __init__(self, dirname):
        self.dirname = dirname
        self.counter = 0
        self.go_to_new_file()
    def go_to_new_file(self):
        self.counter += 1
        dirname = '%s/%02d/%04d' % (self.dirname,
                                    self.counter % 100,
                                    self.counter % 10000)
        filename = '%s/message-%s' % (dirname, self.counter)
        makedirs(dirname)
        self.fo = file(filename, 'w')
    def write(self, data): self.fo.write(data)
    def close(self): self.fo.close()

def split(mbox, output):
    for line in mbox:
        if line.startswith('From '):
            output.go_to_new_file()
        output.write(line)
    output.close()

if __name__ == '__main__':
    split(sys.stdin, Output(sys.argv[1]))

Sat, 14 Jun 2008

All the timing notes here are from my 700MHz laptop with filesystem
encryption.

I did this in Perl because Python doesn't have a `readdir` that lets
you iterate over the files in the directory one at a time; it only has
a `listdir` that constructs a list of all of them in memory and
returns it.  Maybe that would have been fine.

Like everything else posted to kragen-hacks without any notice to the
contrary, this program is in the public domain; I abandon any
copyright in it.

# wrote this to clean up a directory with 500 000 files in it.  After
# 8 hours of rm -rf, rm had only cut it down from 800 000 to 500 000
# files.

# without the unlink, this took only 99 seconds to run over all 
# 500 000 files.

# with unlink, it got through 18479 files in 3m2.7s, so that's
# actually 100 files per second, so it should be done after 5000
# seconds.

# A second run got through 60893 files in 5m23s, or 323s, which is 188
# files per second.  This isn't reassuring that I'm measuring
# performance accurately but at least I know there's no substantial
# N^2 term.

# third run: 234502 files in 20m45s.  Again 188 files per second.  But
# then the next time around, it took 80 seconds without successfully
# readdirring anything.

time perl -e 'opendir MB, "mboxtmp" or die; 
     while (my $x = readdir MB) { print "$x\n" }
' | perl -ne '$| = 1; 
              chomp; 
              unlink "mboxtmp/$_" or warn "$_: $!";
              print "$.\r"'

time perl -e 'opendir MB, "mboxtmp.2" or die; 
     while (my $x = readdir MB) { print "$x\n" }
' | perl -ne '$| = 1; 
              chomp; 
              unlink "mboxtmp.2/$_" or warn "$_: $!";
              print "$.\r"'

Sat, 07 Jun 2008

Since working on this new project where we're using
<http://jottit.com/> for our to-do list, I've become enamored of
Markdown.  So I wrote this script to allow me to write documents
originally in Markdown and then generate HTML versions.

This works particularly well with Emacs `longlines-mode`.  I wrote the
first draft of my new company's "Acta Constitutiva" (i.e. charter and
bylaws) in it.  With CSS and with `M-x recompile` bound to the F5 key,
and `compile-command` set to `(cd ~/distributed-expertise;
~/devel/mkhtml.py bylaws; iceweasel bylaws.html)`, it was a pretty
reasonable word-processing experience, preferable to OpenOffice Write
for the following reasons:

- On my 384MB 700MHz laptop, OpenOffice is painfully slow; Emacs
  screams.
- My Emacs has an input method that handles Spanish reasonably;
  OpenOffice might, but I haven't been able to find it.
- I could edit in HTML and CSS in the cases where I wanted it, and
  pretend I was just editing a normal text file the rest of the time,
  with all of the normal Emacs amenities.  Except with
  word-processor-style word wrap instead of this M-q crap.
- Stylesheeting comes naturally.  I just put a `<style>` element at
  the top with a few lines inside of it to format nicely.
- I can see more of the document at a time in Emacs.

Like everything else posted to kragen-hacks without any notice to the
contrary, this program is in the public domain; I abandon any
copyright in it.

#!/usr/bin/python
"""Turn Markdown documents into HTML documents.

Depends on python-markdown and Beautiful Soup.

Markdown normally generates HTML document content; this generates HTML
documents instead.

"""
import markdown, BeautifulSoup, sys, os, os.path

def render(text):
    "Given Markdown input as a string, produce an HTML document as a string."
    body = str(markdown.Markdown(text))
    soup = BeautifulSoup.BeautifulSoup(body)

    headers = soup('h1')
    if len(headers) > 0:
        title = headers[0].renderContents()
    else:
        title = 'Lame document with no top-level header'

    return '''<html><head><title>%s</title>
    <meta http-equiv="content-type" content="text/html; charset=utf-8" />
    </head>
    <body>%s</body></html>''' % (title, body)

def process(infile):
    "Given a filename of Markdown input, create an HTML file as output."
    outfile = infile + '.html'

    if os.path.exists(outfile) and \
           os.stat(outfile).st_mtime > os.stat(infile).st_mtime:
        print "`%s` is newer than `%s`, skipping  " % (outfile, infile)
        return

    outfiletmp = outfile + '.tmp'
    fo = file(outfiletmp, 'w')
    fo.write(render(file(infile).read()))
    fo.close()

    os.rename(outfiletmp, outfile)  # atomic replace; won't work on Win32
    print "rendered `%s` to `%s`  " % (infile, outfile)

def main(args):
    filenames = args[1:]
    if filenames:
        for filename in filenames: process(filename)
        return 0
    else:
        print ("usage: `%s foo bar baz`; implicitly writes to `foo.html`, etc."
               % args[0])
        return 1

if __name__ == '__main__':
    sys.exit(main(sys.argv))