This is an implementation of the Lossy Counting Algorithm described in Manku and Motwani's 2002 paper "Approximate Frequency Counts over Data Streams".

It is an algorithm for estimating which elements in a stream have a frequency count exceeding a given threshold, while using only limited memory. For example, for video view counts on something like YouTube, finding the videos that each constitute more than 3% of the views.

Please let me know if you spot any bugs.

from math import ceil


class LossyCount:
    """Lossy Counting (Manku & Motwani, 2002): approximate frequency counts
    over a data stream using bounded memory.

    Each tracked element maps to a pair ``(freq, delta)`` where ``freq`` is
    the number of occurrences counted since the element was (last) inserted,
    and ``delta`` is the maximum possible undercount: the index of the bucket
    that was current at insertion time, minus one.
    """

    def __init__(self, max_error=0.005):
        # max_error is the parameter they call epsilon in the paper: the
        # maximum relative error of any reported frequency estimate.
        self.max_error = max_error
        # Each "bucket" spans ceil(1/epsilon) consecutive stream items.
        self.bucket_width = ceil(1 / max_error)
        self.entries = {}  # element -> (freq, delta)
        self.n = 0  # total number of stream items seen so far

    def put(self, x):
        """Feed one stream element x into the counter."""
        self.n += 1
        # Exact integer ceiling division. The original float form
        # ceil(self.n / self.bucket_width) can round incorrectly once n is
        # large enough that n/width is not exactly representable as a float.
        current_bucket = -(-self.n // self.bucket_width)
        # A newly seen element may have been pruned before, so it could have
        # occurred in any earlier bucket: delta = current_bucket - 1.
        freq, delta = 1, current_bucket - 1
        if x in self.entries:
            freq, delta = self.entries[x]
            freq += 1
        self.entries[x] = (freq, delta)
        # At each bucket boundary, prune entries whose best-case true count
        # (freq + delta) cannot exceed the current bucket index.
        if self.n % self.bucket_width == 0:
            self.entries = {
                key: (freq, delta)
                for key, (freq, delta) in self.entries.items()
                if freq + delta > current_bucket
            }

    def get(self, support_threshold=0.001):
        """Return the elements whose estimated frequency exceeds the support.

        support_threshold is the parameter they call s in the paper. The
        guarantee: every element with true count >= s*n is returned, and no
        element with true count < (s - epsilon)*n is returned.
        """
        min_freq = (support_threshold - self.max_error) * self.n
        return [key for key, (freq, _) in self.entries.items() if freq >= min_freq]


# Generate test data.
from math import log
from random import random, randint

view_cnt = 500000
videos_cnt = 100000
x = [random() for _ in range(view_cnt)]
# y = [1/v for v in x]  # This distribution is too steep...
# A distribution that is reasonably steep and has a very long tail.
y = [(1/v)*0.01 - log(v) for v in x]
m = max(y)
y = [v/m for v in y]
# ids = [i for i in range(videos_cnt)]  # Easy to read IDs, but unrealistic. Most popular video will have ID 0, second most popular ID 1, etc.
ids = [randint(1000000, 9000000) for _ in range(videos_cnt)]  # More realistic video IDs.
idxs = [int(v*(videos_cnt-1)) for v in y]
views = [ids[v] for v in idxs]
# import matplotlib.pyplot as plt
# plt.hist(views, bins=200)  # Only works when the IDs are 1,2,3,4...
# plt.show()

threshold = 0.03  # We are interested in videos that each constitute more than 3% of the views.

# Generate exact results using a counter, to compare with.
from collections import Counter

# Generate exact results with a Counter, as ground truth to compare against.
# (The original built an intermediate list of (count, id) tuples and then
# filtered it in a second pass; a single comprehension over items() does the
# same thing directly.)
c = Counter(views)
popular = [vid for vid, cnt in c.items() if cnt >= view_cnt * threshold]
print(sorted(popular))

# Test the LossyCount class. Should give similar (but not exact) results to the above.
lc = LossyCount()
for v in views:
    lc.put(v)
print(sorted(lc.get(threshold)))

## No comments:

## Post a Comment