Search a large sorted text file in Python

homeblogtwitterthingiverse



This is a very simple way to deal with a large static database:

class Searcher:
    def __init__(self, filename):
        self.f = open(filename, 'rb')
        self.f.seek(0,2)
        self.length = self.f.tell()
        
    def find(self, string):
        low = 0
        high = self.length
        while low < high:
            mid = (low+high)//2
            p = mid
            while p >= 0:
                self.f.seek(p)
                if self.f.read(1) == '\n': break
                p -= 1
            if p < 0: self.f.seek(0)
            line = self.f.readline()
            print '--', mid, line
            if line < string:
                low = mid+1
            else:
                high = mid
        
        p = low
        while p >= 0:
            self.f.seek(p)
            if self.f.read(1) == '\n': break
            p -= 1
        if p < 0: self.f.seek(0)
        
        result = [ ]    
        while True:
            line = self.f.readline()
            if not line or not line.startswith(string): break
            if line[-1:] == '\n': line = line[:-1]
            result.append(line[len(string):])
        return result

This class can be used to quckly find all lines in a sorted file that start with a given string. Pass find a string containing <table identifier><key><delimiter> and it will return all associated values.


Update 6/2/2009: Thanks Paul Randall for pointing out a bug that prevented this finding the first item in the file. Now fixed.




[æ]