src/utils/findMultiplication.py


"""
This just allows me to use quickly use a log to see how much I could parallelize the processing of the trials without going over the rate limit
not really meant for production, just a quick check, will be incorporated much better eventually. Used this to find out that about 100 workers is the perfect number to get around 75% use of rate limits
"""
import re

log_file_path = "../../logs/clinical_trial_analysis_20250312_050821.log"


def extract_cycles(field_prefix, encoding="latin-1"):
    """
    Extract cycles for a given field_prefix (e.g. "tokens" or "requests").
    Each cycle is determined by consecutive log lines where the reset value
    decreases or stays the same; when it increases, the previous cycle is
    considered complete.
    Returns a list of dictionaries with keys: timestamp, reset, remaining,
    used, percent_used.
    """
    # Compile regex patterns dynamically for the requested header family.
    timestamp_pat = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")
    limit_pat = re.compile(r"b'x-ratelimit-limit-" + field_prefix + r"', b'(\d+)'")
    remaining_pat = re.compile(
        r"b'x-ratelimit-remaining-" + field_prefix + r"', b'(\d+)'"
    )
    reset_pat = re.compile(r"b'x-ratelimit-reset-" + field_prefix + r"', b'(\d+)ms'")
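    # Illustrative shape of a matching log line (reconstructed from the
    # patterns above; the exact surrounding text in the real log may differ):
    #   2025-03-12 05:08:21 ... (b'x-ratelimit-limit-tokens', b'200000'),
    #   (b'x-ratelimit-remaining-tokens', b'123456'),
    #   (b'x-ratelimit-reset-tokens', b'820ms') ...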

    records = []
    with open(log_file_path, "r", encoding=encoding) as f:
        for line in f:
            if "x-ratelimit-reset-" + field_prefix in line:
                ts_match = timestamp_pat.search(line)
                ts = ts_match.group(1) if ts_match else None
                limit_match = limit_pat.search(line)
                remaining_match = remaining_pat.search(line)
                reset_match = reset_pat.search(line)
                if limit_match and remaining_match and reset_match:
                    record = {
                        "timestamp": ts,
                        "limit": int(limit_match.group(1)),
                        "remaining": int(remaining_match.group(1)),
                        "reset": int(reset_match.group(1)),
                    }
                    records.append(record)

    # Group records into cycles.
    cycles = []
    if records:
        current_cycle = [records[0]]
        for rec in records[1:]:
            # Within a cycle, the reset value should decrease (or stay the
            # same); a jump up marks the start of a new cycle.
            if rec["reset"] <= current_cycle[-1]["reset"]:
                current_cycle.append(rec)
            else:
                cycles.append(current_cycle[-1])
                current_cycle = [rec]
        if current_cycle:
            cycles.append(current_cycle[-1])
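    # Example with hypothetical reset values 900 -> 600 -> 200 -> 950:
    # the jump from 200 up to 950 closes the first cycle, and its last record
    # (reset=200, the most-depleted sample) is kept as that cycle's snapshot.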

    # Calculate used and percent used for each cycle snapshot.
    results = []
    for cycle in cycles:
        used = cycle["limit"] - cycle["remaining"]
        percent_used = (used / cycle["limit"]) * 100 if cycle["limit"] else 0
        results.append(
            {
                "timestamp": cycle["timestamp"],
                "reset": cycle["reset"],
                "remaining": cycle["remaining"],
                "used": used,
                "percent_used": percent_used,
            }
        )
    return results


# Extract cycles for tokens and requests.
token_cycles = extract_cycles("tokens")
request_cycles = extract_cycles("requests")


def compute_averages(cycles, field_name):
    """Average the per-cycle usage numbers across all detected cycles."""
    if not cycles:
        return f"No {field_name} data found."
    total_used = sum(cycle["used"] for cycle in cycles)
    total_percent = sum(cycle["percent_used"] for cycle in cycles)
    avg_used = total_used / len(cycles)
    avg_percent = total_percent / len(cycles)
    return {
        "field": field_name,
        "average_used": avg_used,
        "average_percent_used": avg_percent,
        "total_cycles": len(cycles),
    }


# Compute and print averages for both header families.
token_stats = compute_averages(token_cycles, "tokens")
request_stats = compute_averages(request_cycles, "requests")
print(token_stats)
print(request_stats)
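# Example output shape (the values below are placeholders, not from a real
# run; per the note at the top, percent_used hovered around 75% with ~100
# workers):
#   {'field': 'tokens', 'average_used': ..., 'average_percent_used': ..., 'total_cycles': ...}
#   {'field': 'requests', 'average_used': ..., 'average_percent_used': ..., 'total_cycles': ...}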