This is an implementation of the Lossy Counting Algorithm described in Manku and Motwani's 2002 paper "Approximate Frequency Counts over Data Streams".
It is an algorithm for estimating which elements in a stream have a frequency count exceeding a given threshold, while using only limited memory. For example, for video view counts on something like YouTube, it can find the videos that each constitute more than 3% of the views.
Please let me know if you spot any bugs.
- from math import ceil
class LossyCount:
    """Lossy Counting over a data stream (Manku & Motwani, VLDB 2002).

    Maintains approximate frequency counts using O(1/epsilon * log(epsilon*N))
    entries. Every reported count undercounts the true frequency by at most
    max_error * n, and no element whose true frequency exceeds the support
    threshold is ever missed.
    """

    def __init__(self, max_error=0.005):
        # max_error is the parameter the paper calls epsilon.
        self.max_error = max_error
        # Width of each bucket: w = ceil(1/epsilon).
        self.bucket_width = ceil(1 / max_error)
        # Maps element -> (estimated frequency, maximum possible undercount delta).
        self.entries = dict()
        # Total number of stream items seen so far (N in the paper).
        self.n = 0

    def put(self, x):
        """Feed one stream element into the counter."""
        self.n += 1
        bucket = ceil(self.n / self.bucket_width)
        if x in self.entries:
            count, err = self.entries[x]
            self.entries[x] = (count + 1, err)
        else:
            # A new element may have gone uncounted in all previous buckets,
            # so its delta is the current bucket id minus one.
            self.entries[x] = (1, bucket - 1)
        # At each bucket boundary, drop entries whose count can no longer
        # reach the current bucket id even with the worst-case undercount.
        if self.n % self.bucket_width == 0:
            self.entries = {
                key: (count, err)
                for key, (count, err) in self.entries.items()
                if count + err > bucket
            }

    def get(self, support_threshold=0.001):
        """Return elements whose frequency likely exceeds support_threshold.

        support_threshold is the parameter the paper calls s.
        """
        cutoff = (support_threshold - self.max_error) * self.n
        return [key for key, (count, _) in self.entries.items() if count >= cutoff]
# --- Build a synthetic stream of video views. ---
from math import log
from random import random, randint
from collections import Counter

view_cnt = 500000
videos_cnt = 100000

u = [random() for _ in range(view_cnt)]
# (1/v) on its own is too steep a distribution; adding -log(v) and scaling
# gives a popularity curve that is reasonably steep with a very long tail.
popularity = [(1 / v) * 0.01 - log(v) for v in u]
peak = max(popularity)
popularity = [p / peak for p in popularity]
# Sequential IDs (0, 1, 2, ...) would be easier to read — most popular video
# would be ID 0 — but 7-digit random IDs are more realistic.
ids = [randint(1000000, 9000000) for _ in range(videos_cnt)]
idxs = [int(p * (videos_cnt - 1)) for p in popularity]
views = [ids[i] for i in idxs]
# A histogram (matplotlib: plt.hist(views, bins=200)) is only readable when
# the IDs are the sequential 0..N-1 variant above.

# We are interested in videos that each constitute more than 3% of the views.
threshold = 0.03

# --- Exact answer via a Counter, to compare against. ---
exact = Counter(views)
heavy = [vid for vid, cnt in exact.items() if cnt >= view_cnt * threshold]
print(sorted(heavy))

# --- LossyCount should give similar (but not exact) results. ---
lc = LossyCount()
for v in views:
    lc.put(v)
print(sorted(lc.get(threshold)))
No comments:
Post a Comment