aboutsummaryrefslogtreecommitdiff
path: root/src/main
diff options
context:
space:
mode:
author: Eadrom <Eadrom@users.noreply.github.com> 2024-01-14 06:31:46 -0700
committer: GitHub <noreply@github.com> 2024-01-14 14:31:46 +0100
commit: e4f0891d2dddff9461945cc83fe36b36c26dba4a (patch)
tree: 2091e1f1ff3e1f32a583952216394fd3fdae4fdd /src/main
parent: 38f8b8d544b7e636c25a29f53907838636f3d486 (diff)
added python script to build test data (#366)
* Added Python script to build test data
* Moved create_measurements.py to src/main/python and updated paths for file I/O
* Updated README to include a blurb about the Python script to generate measurements
Diffstat (limited to 'src/main')
-rwxr-xr-xsrc/main/python/create_measurements.py143
1 files changed, 143 insertions, 0 deletions
diff --git a/src/main/python/create_measurements.py b/src/main/python/create_measurements.py
new file mode 100755
index 0000000..f48972a
--- /dev/null
+++ b/src/main/python/create_measurements.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+# Based on https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CreateMeasurements.java
+
+import os
+import sys
+import random
+import time
+
+
def check_args(file_args):
    """
    Sanity checks our input and prints out usage if input is not a positive integer

    Exactly one argument is expected after the script name, and it must parse
    as a positive integer. Underscore digit separators (e.g. 1_000_000) are
    accepted because int() understands them.

    Args:
        file_args: The raw argument vector, normally sys.argv.

    Raises:
        SystemExit: With exit status 1 when the arguments are invalid.
    """
    try:
        is_valid = len(file_args) == 2 and int(file_args[1]) > 0
    except (TypeError, ValueError):
        # int() raises ValueError for non-numeric text and TypeError for
        # objects it cannot convert; either way there is no usable count.
        is_valid = False

    if is_valid:
        return

    print("Usage: create_measurements.py <positive integer number of records to create>")
    print(" You can use underscore notation for large number of records.")
    print(" For example: 1_000_000_000 for one billion")
    # sys.exit (not the interactive-only exit() helper) with a non-zero status
    # so that calling shells and scripts can detect the bad invocation.
    sys.exit(1)
+
+
def build_weather_station_name_list(csv_path='../../../data/weather_stations.csv'):
    """
    Grabs the weather station names from example data provided in repo and dedups

    Each data line is expected to look like "<station name>;<value>". Lines
    starting with "#" are treated as comments and blank lines are ignored.

    Args:
        csv_path: Location of the station CSV. The default is relative to the
            current working directory.

    Returns:
        A list of unique station names, in no particular order.

    Raises:
        OSError: If the CSV file cannot be opened.
    """
    station_names = set()
    # Station names contain non-ASCII characters, so decode explicitly rather
    # than relying on the platform default encoding.
    with open(csv_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip('\r\n')
            # Only a leading '#' marks a comment; a '#' inside a station name
            # must not cause that station to be dropped.
            if not line or line.startswith('#'):
                continue
            name = line.split(';')[0]
            if name:
                station_names.add(name)
    return list(station_names)
+
+
def convert_bytes(num):
    """
    Convert bytes to a human-readable format (e.g., KiB, MiB, GiB)

    Args:
        num: A size in bytes (int or float).

    Returns:
        A string such as "1.5 KiB". Sizes beyond the largest known unit are
        still reported in that unit instead of returning None.
    """
    units = ('bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB')
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    # Anything that reaches this point is expressed in the largest unit.
    return "%3.1f %s" % (num, units[-1])
+
+
def format_elapsed_time(seconds):
    """
    Render a duration given in seconds as readable text

    Durations under a minute keep millisecond precision. Longer durations are
    broken into whole hours / minutes / seconds, and the minutes part is left
    out of the hours form when it is zero.
    """
    if seconds < 60:
        return f"{seconds:.3f} seconds"

    if seconds < 3600:
        mins, secs = divmod(seconds, 60)
        return f"{int(mins)} minutes {int(secs)} seconds"

    hrs, leftover = divmod(seconds, 3600)
    mins, secs = divmod(leftover, 60)
    pieces = [f"{int(hrs)} hours"]
    if mins != 0:
        pieces.append(f"{int(mins)} minutes")
    pieces.append(f"{int(secs)} seconds")
    return " ".join(pieces)
+
+
def estimate_file_size(weather_station_names, num_rows_to_create):
    """
    Tries to estimate how large a file the test data will be

    The per-record size is a rough heuristic built from the longest and
    shortest station name plus a fixed allowance for the separator and
    temperature. Name lengths are counted in characters, not encoded bytes.

    Args:
        weather_station_names: Station names that may appear in the output.
        num_rows_to_create: Number of records that will be generated.

    Returns:
        A two-line message containing the human-readable size estimate. With
        no station names the estimate is zero bytes.
    """
    name_lengths = [len(station) for station in weather_station_names]
    if name_lengths:
        # Computed once from the extremes rather than on every loop pass.
        per_record_size = (
            (max(name_lengths) + min(name_lengths) * 2) + len(",-123.4")
        ) / 2
    else:
        per_record_size = 0

    total_file_size = num_rows_to_create * per_record_size
    human_file_size = convert_bytes(total_file_size)

    return f"Estimated max file size is: {human_file_size}.\nTrue size is probably much smaller (around half)."
+
+
def build_test_data(weather_station_names, num_rows_to_create):
    """
    Generates and writes to file the requested length of test data

    Each record is written as "<station>;<temperature>" followed by a newline.
    The temperature is a uniform random value in [-99.9, 99.9] rounded to one
    decimal place. A text progress bar is drawn on stdout while writing, and
    the final file size and elapsed time are printed afterwards.

    Args:
        weather_station_names: Pool of station names to sample from.
        num_rows_to_create: Number of records to write.

    Raises:
        SystemExit: With exit status 1 if writing the file fails.
    """
    # Relative to the current working directory.
    output_path = "../../../data/measurements.txt"
    start_time = time.time()
    coldest_temp = -99.9
    hottest_temp = 99.9
    # Sampling 10,000 names with replacement caps the number of distinct
    # stations in the output at 10,000.
    station_names_10k_max = random.choices(weather_station_names, k=10_000)
    progress_step = max(1, num_rows_to_create // 100)
    print('Building test data...')

    try:
        # Explicit UTF-8 and "\n" line endings keep the output identical on
        # every platform; station names contain non-ASCII characters.
        with open(output_path, 'w', encoding='utf-8', newline='\n') as file:
            for s in range(num_rows_to_create):
                random_station = random.choice(station_names_10k_max)
                random_temp = round(random.uniform(coldest_temp, hottest_temp), 1)
                file.write(f"{random_station};{random_temp}\n")
                # Update progress bar every 1%
                if s % progress_step == 0 or s == num_rows_to_create - 1:
                    fraction_done = (s + 1) / num_rows_to_create
                    sys.stdout.write('\r')
                    sys.stdout.write("[%-50s] %d%%" % ('=' * int(fraction_done * 50), fraction_done * 100))
                    sys.stdout.flush()
        sys.stdout.write('\n')
    except Exception as e:
        print("Something went wrong. Printing error info and exiting...")
        print(e)
        # Non-zero status via sys.exit so callers can detect the failure.
        sys.exit(1)

    end_time = time.time()
    elapsed_time = end_time - start_time
    file_size = os.path.getsize(output_path)
    human_file_size = convert_bytes(file_size)

    print("Test data successfully written to 1brc/data/measurements.txt")
    print(f"Actual file size: {human_file_size}")
    print(f"Elapsed time: {format_elapsed_time(elapsed_time)}")
+
+
def main():
    """
    main program function

    Validates the command line, loads the station names, prints a size
    estimate and then generates the measurements file.
    """
    check_args(sys.argv)
    # check_args has already confirmed that argv[1] parses as a positive int.
    num_rows_to_create = int(sys.argv[1])
    weather_station_names = build_weather_station_name_list()
    print(estimate_file_size(weather_station_names, num_rows_to_create))
    build_test_data(weather_station_names, num_rows_to_create)
    print("Test data build complete.")
+
+
# Run only when executed as a script. No unconditional exit() follows the
# guard, so importing this module does not terminate the interpreter.
if __name__ == "__main__":
    main()