Sunday, April 25, 2021

Lossy Counting Algorithm Python example

This is an implementation of the Lossy Counting Algorithm described in Manku and Motwani's 2002 paper "Approximate Frequency Counts over Data Streams".

It is an algorithm for estimating which elements of a stream exceed a given frequency threshold, using only limited memory. For example, for video view counts on something like YouTube: finding the videos that each constitute more than 3% of the views. The stream is processed in buckets of width ceil(1/epsilon), and at every bucket boundary the entries whose counts are provably small are pruned. The paper shows that every reported count undercounts the true frequency by at most epsilon*n, and that at most (1/epsilon)*log(epsilon*n) entries are kept at any time.

Please let me know if you spot any bugs.

from math import ceil

class LossyCount:
    def __init__(self, max_error=0.005):  # max_error is the parameter they call epsilon in the paper.
        self.max_error = max_error
        self.bucket_width = ceil(1/max_error)
        self.entries = dict()  # Maps element -> (estimated frequency, max possible undercount).
        self.n = 0  # Number of elements seen so far.

    def put(self, x):
        self.n += 1
        current_bucket = ceil(self.n / self.bucket_width)

        # A new element gets frequency 1 and delta = current_bucket - 1, the most
        # it could have occurred before we started tracking it.
        freq, delta = 1, current_bucket - 1
        if x in self.entries:
            freq, delta = self.entries[x]
            freq += 1
        self.entries[x] = (freq, delta)

        # If at a bucket boundary then prune low frequency entries.
        if self.n % self.bucket_width == 0:
            prune = [key for key, (freq, delta) in self.entries.items()
                     if freq + delta <= current_bucket]
            for key in prune:
                del self.entries[key]

    def get(self, support_threshold=0.001):  # support_threshold is the parameter they call s in the paper.
        res = []
        for key, (freq, _) in self.entries.items():
            if freq >= (support_threshold - self.max_error) * self.n:
                res.append(key)
        return res
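
As a quick sanity check (my own toy example, not from the paper), here is a hand-made stream where one element dominates. Lossy counting never misses a truly frequent element, and never reports anything with true frequency below (s - epsilon)*n:

demo = LossyCount(max_error=0.01)
stream = [1]*900 + [2]*50 + [3]*50  # Element 1 makes up 90% of the stream.
for item in stream:
    demo.put(item)
print(demo.get(support_threshold=0.5))  # Prints [1]: only element 1 reaches 50% support.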




# Generate test data.
from math import log
from random import random, randint
view_cnt = 500000
videos_cnt = 100000
x = [1.0 - random() for _ in range(view_cnt)]  # Uniform in (0, 1]; random() alone could return 0.0 and crash 1/v below.
# y = [1/v for v in x]  # This distribution is too steep...
y = [(1/v)*0.01 - log(v) for v in x]  # A distribution that is reasonably steep and has a very long tail.
m = max(y)
y = [v/m for v in y]  # Normalize to (0, 1].
# ids = [i for i in range(videos_cnt)]  # Easy-to-read IDs, but unrealistic. The most popular video would have ID 0, the second most popular ID 1, etc.
ids = [randint(1000000, 9000000) for _ in range(videos_cnt)]  # More realistic video IDs.
idxs = [int(v*(videos_cnt-1)) for v in y]  # Map each view to a video index.
views = [ids[v] for v in idxs]

# import matplotlib.pyplot as plt
# plt.hist(views, bins=200)  # Only meaningful when the IDs are 0, 1, 2, 3, ...
# plt.show()
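
If you don't have matplotlib handy, a quick way to eyeball the distribution (a convenience of mine, not part of the algorithm) is to print the most viewed videos directly:

from collections import Counter
for video_id, cnt in Counter(views).most_common(5):
    print(f"video {video_id}: {cnt} views ({100*cnt/view_cnt:.1f}%)")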

threshold = 0.03  # We are interested in videos that each constitute more than 3% of the views.

# Generate exact results using a counter, to compare with.
from collections import Counter
c = Counter(views)
exact = [video_id for video_id, cnt in c.items() if cnt >= view_cnt * threshold]
print(sorted(exact))

# Test the LossyCount class. Should give similar (but not exact) results to the above.
lc = LossyCount()
for v in views:
    lc.put(v)
print(sorted(lc.get(threshold)))
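
The payoff is memory: the exact Counter has to keep every distinct video, while LossyCount only keeps the candidates that survived pruning. With the names from above:

print(f"exact Counter tracks {len(c)} distinct videos, LossyCount tracks {len(lc.entries)}")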
