Sat, 22 Oct 2005

#!/usr/bin/python
# Compute "lexical signatures" for each paragraph in a text document

# The idea is to compress these lossily into "queer numbers" to be
# used as fragment IDs.  See my "queer numbers" post on kragen-tol.

# This Python program depends on generator expressions, which were
# introduced in Python 2.4.  You can wrap the generator expressions in
# [] to turn them into list comprehensions to make them run in older
# Pythons.

import re, sys

def paragraphs(afile):
    "Iterate over the paragraphs in a text file."
    current_paragraph = ''
    for line in afile:
        if line.strip() == '':
            if current_paragraph != '': yield current_paragraph
            current_paragraph = ''
        else:
            current_paragraph += line
    if current_paragraph != '': yield current_paragraph

def words(astring):
    "Iterate over the words in a string, such as a paragraph."
    for word in re.findall("['\w]+", astring): yield word.lower()

def freq(aseq):
    "Compute a frequency table of each unique element in a sequence."
    rv = {}
    for item in aseq:
        rv.setdefault(item, 0)
        rv[item] += 1
    return rv

def mergefreqs(freqs):
    "Given a sequence of frequency tables, compute a summary table."
    rv = {}
    for freq in freqs:
        for key, value in freq.iteritems():
            rv.setdefault(key, 0)
            rv[key] += value
    return rv

def nonuniques(freqs):
    "Cull a frequency table to remove things that occur only once."
    return dict((k, v) for k, v in freqs.iteritems() if v > 1)

def count(freq):
    "Given a frequency table, return the length of the sequence it represents."
    return float(sum(freq.itervalues()))

def tfidf(freqs):
    """Compute TF/IDF over a sequence of frequency tables.

    I'm not sure this is actually TF/IDF, because I'm not sure I know
    what TF/IDF is.
    
    """
    freqs = list(freqs)
    totals = mergefreqs(freqs)
    total_terms = count(totals)
    for freq in freqs:
        rv = {}
        terms = count(freq)
        for word, n in freq.iteritems():
            tf = n / terms
            idf = totals[word] / total_terms
            rv[word] = tf/idf
        yield rv

def sigs(freqs):
    """Compute 'lexical signatures' in the form of ordered word lists,
    given a sequence of frequency tables to start with.

    """
    for thingy in tfidf(freqs):
        words = thingy.keys()
        words.sort(key = thingy.get, reverse = True)
        yield words

def display_sigs(infile):
    """Print a text summary of lexical signatures of some file."""
    freqs = list(freq(words(para)) for para in paragraphs(infile))
    hmm = nonuniques(mergefreqs(freqs))
    ii = sigs(freqs)
    max = 10
    for line in ii:
        print ' '.join(line[:max])
        print '->', ' '.join([word for word in line
                              if hmm.has_key(word)][:max])

if __name__ == '__main__': display_sigs(file(sys.argv[1]))