# full_space_estimation.py -- BigData-GitHub-User-Analytics (Mandar-1007, branch: main)
import json
import numpy as np

# Estimate how many IDs in the assumed GitHub user-ID space belong to valid
# users by extrapolating the valid fraction observed in a sample.

# Assumed max ID range of the GitHub user-ID space.
MAX_USER_ID = 10_000_000

# Denominator used for the sample's valid/missing ratio.
# NOTE(review): presumably the number of IDs probed when the sample file was
# produced -- confirm against the sampling script before changing it.
SAMPLE_SIZE = 990


def load_sampled_ids(path="github_users_sample.json"):
    """Return the set of distinct user IDs stored in the JSON sample at *path*.

    Every element of the decoded JSON document must support ``user["id"]``;
    a missing key raises ``KeyError``.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        sampled_data = json.load(f)
    return {user["id"] for user in sampled_data}


def estimate_valid_users(valid_users_sampled, sample_size=SAMPLE_SIZE,
                         max_user_id=MAX_USER_ID):
    """Extrapolate the sample's valid-ID fraction to the whole ID space.

    Args:
        valid_users_sampled: Number of distinct valid users found in the sample.
        sample_size: Total number of IDs the sample is measured against.
        max_user_id: Size of the ID space to extrapolate to.

    Returns:
        ``max_user_id * (1 - missing_ratio)`` as a float, where
        ``missing_ratio = (sample_size - valid_users_sampled) / sample_size``.

    Raises:
        ValueError: If ``sample_size`` is not positive, or if
            ``valid_users_sampled`` lies outside ``[0, sample_size]`` (which
            would give a ratio outside [0, 1] and an estimate outside the
            ID space).
    """
    if sample_size <= 0:
        raise ValueError(f"sample_size must be positive, got {sample_size}")
    if not 0 <= valid_users_sampled <= sample_size:
        raise ValueError(
            f"valid_users_sampled ({valid_users_sampled}) must be between 0 "
            f"and sample_size ({sample_size})"
        )

    missing_users_sampled = sample_size - valid_users_sampled
    missing_ratio = missing_users_sampled / sample_size
    return max_user_id * (1 - missing_ratio)


def main(sample_path="github_users_sample.json"):
    """Load the sample, compute the estimate, and print the results."""
    sampled_ids = load_sampled_ids(sample_path)
    valid_users_sampled = len(sampled_ids)

    estimated_valid_users = estimate_valid_users(valid_users_sampled)

    # Adjust with realistic variance.
    # NOTE(review): this multiplies the estimate by an unseeded random factor
    # in [0.85, 1.15), so the printed figure differs from run to run and is no
    # longer the plain estimator above. Kept to preserve existing output
    # behavior; consider removing it or reporting a confidence interval.
    noise_factor = np.random.uniform(0.85, 1.15)
    estimated_valid_users *= noise_factor

    # Print final estimation
    print("Full GitHub ID Space Estimation Results:")
    print(f"Sampled Users: {valid_users_sampled}")
    print(f"Estimated Total Valid Users: {int(estimated_valid_users)}")


if __name__ == "__main__":
    main()