Tuesday, April 27, 2021

Geohash Python example

Here's a simple Geohash encode/decode implementation. The decoding function pretty closely follows the technique layed out on the Geohash Wikipedia entry.

from fractions import Fraction

"""
First decode the special geohash variant of base32 encoding.
Each encoded digit (0-9-b..z) (not continuous abc) is a 5 bit val 0,1,2...,30,31.
In the resulting bitstream, every second bit is now for latitude and longtitude.
Initially the latitutde range is -90,+90.
When a latitude bit is 1, then it now starts at the mid of these.
Else if 0 it now ends at the mid of these.
Same for longtitude but with range -180,+180.
"""
def decode_geohash(s):
    alphabet_32ghs = "0123456789bcdefghjkmnpqrstuvwxyz"
    dec_from_32ghs = dict()
    for i, c in enumerate(alphabet_32ghs):
        dec_from_32ghs[c] = i

    bits = 0  # Integer representation of hash.
    bit_cnt = 0
    for c in s:
        bits = (bits << 5) | dec_from_32ghs[c]
        bit_cnt += 5

    # Every second bit is longtitude and latitude. Digits in even positions are latitude.
    lat_bits, lon_bits = 0, 0
    lat_bit_cnt = bit_cnt // 2
    lon_bit_cnt = lat_bit_cnt
    if bit_cnt % 2 == 1:
        lon_bit_cnt += 1

    for i in range(bit_cnt):
        cur_bit_pos = bit_cnt - i
        cur_bit = (bits & (1 << cur_bit_pos)) >> cur_bit_pos
        if i % 2 == 0:
            lat_bits |= cur_bit << (cur_bit_pos//2)
        else:
            lon_bits |= cur_bit << (cur_bit_pos//2)

    lat_start, lat_end = Fraction(-90), Fraction(90)
    for cur_bit_pos in range(lat_bit_cnt-1, -1, -1):
        mid = (lat_start + lat_end) / 2
        if lat_bits & (1 << cur_bit_pos):
            lat_start = mid
        else:
            lat_end = mid

    lon_start, lon_end = Fraction(-180), Fraction(180)
    for cur_bit_pos in range(lon_bit_cnt-1, -1, -1):
        mid = (lon_start + lon_end) / 2
        if lon_bits & (1 << cur_bit_pos):
            lon_start = mid
        else:
            lon_end = mid

    return float(lat_start), float(lat_end), float(lon_start), float(lon_end)


# Inspired by https://www.factual.com/blog/how-geohashes-work/
def encode_geohash(lat, lon, bit_cnt):
    if bit_cnt % 5 != 0:
        raise ValueError("bit_cnt must be divisible by 5")

    bits = 0
    lat_start, lat_end = Fraction(-90), Fraction(90)
    lon_start, lon_end = Fraction(-180), Fraction(180)
    for i in range(bit_cnt):
        if i % 2 == 0:
            mid = (lon_start + lon_end) / 2
            if lon < mid:
                bits = (bits << 1) | 0
                lon_end = mid
            else:
                bits = (bits << 1) | 1
                lon_start = mid
        else:
            mid = (lat_start + lat_end) / 2
            if lat < mid:
                bits = (bits << 1) | 0
                lat_end = mid
            else:
                bits = (bits << 1) | 1
                lat_start = mid

    print("bits: {:>b}".format(bits))

    # Do the special geohash base32 encoding.
    s = ""
    alphabet_32ghs = "0123456789bcdefghjkmnpqrstuvwxyz"
    for i in range(bit_cnt // 5):
        idx = (bits >> i*5) & (1 | 2 | 4 | 8 | 16)
        s += alphabet_32ghs[idx]
    return s[::-1]


print(decode_geohash("ezs42"))
print(decode_geohash("9q8y"))
print(encode_geohash(37.7, -122.5, 20))

Sunday, April 25, 2021

Lossy Counting Algorithm Python example

This is an implementation of the Lossy Counting Algorithm described in Manku and Motwani's 2002 paper "Approximate Frequency Counts over Data Streams".

It is an algorithm for estimate elements in a stream whose frequency count exceeds a threshold, while using only limited memory. For example for video view counts on something like YouTube, finding which videos that each constitute more than 3% of the views.

Please let me know if you spot any bugs.

from math import ceil
class LossyCount:
    def __init__(self, max_error=0.005):  # max_error is the parameter they call epsilon in the paper.
        self.max_error = max_error
        self.bucket_width = ceil(1/max_error)
        self.entries = dict()
        self.n = 0

    def put(self, x):
        self.n += 1
        current_bucket = ceil(self.n / self.bucket_width)

        freq, delta = 1, current_bucket-1
        if x in self.entries:
            freq, delta = self.entries[x]
            freq += 1
        self.entries[x] = (freq, delta)

        # If at bucket boundary then prune low frequency entries.
        if self.n % self.bucket_width == 0:
            prune = []
            for key in self.entries:
                freq, delta = self.entries[key]
                if freq + delta <= current_bucket:
                    prune.append(key)
            for key in prune:
                del self.entries[key]

    def get(self, support_threshold=0.001):  # support_threshold is the parameter they call s in the paper.
        res = []
        for key in self.entries:
            freq, delta = self.entries[key]
            if freq >= (support_threshold - self.max_error)*self.n:
                res.append(key)
        return res




# Generate test data.
from math import log
from random import random, randint
view_cnt = 500000
videos_cnt = 100000
x = [random() for _ in range(view_cnt)]
# y = [1/v for v in x]  # This distribution is too steep...
y = [(1/v)*0.01 -log(v) for v in x]  # A distribution that is reasonably steep and has a very long tail.
m = max(y)
y = [v/m for v in y]
# ids = [i for i in range(videos_cnt)]  # Easy to read IDs, but unrealistic. Most popular video will have ID 0, second most popular ID 1, etc.
ids = [randint(1000000, 9000000) for _ in range(videos_cnt)]  # More realistic video IDs.
idxs = [int(v*(videos_cnt-1)) for v in y]
views = [ids[v] for v in idxs]

# import matplotlib.pyplot as plt
# plt.hist(views, bins=200)  # Only works when the IDs are 1,2,3,4...
# plt.show()

threshold = 0.03  # We are interested in videos that each constitute more than 3% of the views.

# Generate exact results using a counter, to compare with.
from collections import Counter
c = Counter(views)
r = []
for k in c:
    r.append((c[k], k))
r2 = []
for cnt, id in r:
    if cnt >= view_cnt*threshold:
        r2.append(id)
print(sorted(r2))

# Test the LossyCount class. Should give similar (but not exact) results to the above.
lc = LossyCount()
for v in views:
    lc.put(v)
print(sorted(lc.get(threshold)))