Tuning_hyperparameters/DNNGP_OPN.py
# DNNGP3 tuning hyperparameters script
import os
import re
import json
import subprocess
import numpy as np
import nevergrad as ng
import tensorflow as tf

# Parameters must be set in three places: the directory and file paths just below,
# the hyperparameter search space (instr), and the DNNGP command line built inside objective().
# They are listed in descending order of priority; apart from the directory and file
# paths, the default values are sufficient for most use cases.
# Define directories and file paths
output_dir = r'..\Output_files'
pkl_file = r"..\Input_files\wheat599_pc95.pkl"
budget = 200  # Number of optimization iterations (objective evaluations)
alpha = 0.7  # Weight that balances the mean against the variance in the optimization objective
beta = 0.1  # Controls the nonlinear effect of the variance in the optimization objective
cvs = 10  # Number of folds for k-fold cross-validation
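
# For orientation (illustrative numbers only, not results from a real run): with
# alpha = 0.7 and beta = 0.1, a fold-mean statistic of 0.50 with variance 0.04 scores
# about 0.7 * 0.50 - 0.3 * exp(0.1 * 0.04) ≈ 0.049; objective() below negates this
# value before returning it, because Nevergrad minimizes.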
|
|
pkl_dir = os.path.dirname(pkl_file)
# Obtain all tsv files in the directory where the pkl file resides
tsv_files = [f for f in os.listdir(pkl_dir) if f.endswith('.tsv')]
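# Assumed layout: every .tsv found next to the .pkl file is treated as a phenotype
# file and is tuned independently in the loop at the bottom of this script.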
|
|
def check_gpu_available():
    """Check and display GPU availability information"""
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        print("🎉 GPU is available!")
        for idx, gpu in enumerate(gpus):
            print(f"[Device {idx}]")
            print(f" Name: {gpu.name}")
            try:
                details = tf.config.experimental.get_device_details(gpu)
                print(f" Compute Capability: {details.get('compute_capability')}")
                print(f" Device Type: {details.get('device_type', 'N/A')}")
            except AttributeError:
                print(" Unable to retrieve detailed device information (may require a newer TensorFlow version)")
        return True
    else:
        print("⚠️ No GPU detected, will use CPU")
        return False

check_gpu_available()
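
# Note: the check above only reports what TensorFlow sees in this process; the actual
# DNNGP training runs in the subprocesses launched from objective() below.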
|
|
# Define the hyperparameter search space (see https://github.com/facebookresearch/nevergrad)
instr = ng.p.Instrumentation(
    batch_size=ng.p.Scalar(lower=32, upper=1024).set_integer_casting(),
    lr=ng.p.Log(lower=1e-4, upper=1),
    patience=ng.p.Scalar(lower=10, upper=50).set_integer_casting(),
    dropout1=ng.p.Log(lower=0.01, upper=0.9),
    dropout2=ng.p.Log(lower=0.01, upper=0.9),
    earlystopping=ng.p.Scalar(lower=50, upper=100).set_integer_casting()
)
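# ng.p.Instrumentation exposes these entries to the optimizer as keyword arguments
# (candidate.kwargs), so the names above must match the parameters of objective() below.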
|
|
# Define a function to extract statistic values
def extract_statistics(output):
    statistics = re.findall(r'statistic=([-+]?[0-9]*\.?[0-9]+)', output)
    if not statistics:
        return 0.0
    statistic_value = float(statistics[0])
    return statistic_value
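
# Example (output format assumed from the regex above, not taken from a real DNNGP log):
# a line containing "statistic=0.4321" in the runner's stdout makes extract_statistics()
# return 0.4321; if no such token is found, the fold is scored as 0.0.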
|
|
# Define the objective function
def objective(batch_size: int, lr: float, patience: int, dropout1: float,
              dropout2: float, earlystopping: int, tsv_file: str):
    accuracies = []
    print('batch:', batch_size, 'lr:', lr, 'patience:', patience, 'dropout1:', dropout1,
          'dropout2:', dropout2, 'earlystopping:', earlystopping, 'tsv_file:', tsv_file)

    for part in range(1, cvs + 1):
        command = (
            f"python ../Scripts/dnngp_runner.py --batch_size {batch_size} --epoch 10000 "
            f"--lr {lr} --patience {patience} --dropout1 {dropout1} --dropout2 {dropout2} "
            f"--earlystopping {earlystopping} --cv {cvs} --part {part} --snp {pkl_file} "
            f"--pheno {os.path.join(pkl_dir, tsv_file)} --output {output_dir}"
        )
        print(command)
        p = subprocess.Popen(command, shell=True,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = p.communicate()

        # Decode output
        output_str = output.decode(errors='ignore')
        error_str = error.decode(errors='ignore')

        if error_str:
            print("Error Output:", error_str)

        accuracy = extract_statistics(output_str)
        accuracies.append(accuracy)
    print("Statistic values for all folds:", accuracies)

    mean_accuracy = np.mean(accuracies) if accuracies else 0.0
    var_accuracy = np.var(accuracies) if accuracies else 0.0

    # Weighted combination that balances the mean and the variance of the fold statistics;
    # the exponential transform makes the variance penalty nonlinear
    combined_metric = alpha * mean_accuracy - \
        (1 - alpha) * np.exp(beta * var_accuracy)
    return -combined_metric
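
# Optional smoke test (illustrative only, not part of the original workflow): uncomment
# to score a single, arbitrarily chosen hyperparameter set on the first phenotype file
# before spending the full optimization budget.
# if tsv_files:
#     print(objective(batch_size=64, lr=0.01, patience=20, dropout1=0.3,
#                     dropout2=0.3, earlystopping=60, tsv_file=tsv_files[0]))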
|
|
# Record the best parameters and results for each tsv file
best_params_per_tsv = {}

for tsv_file in tsv_files:
    print(f"Optimizing for TSV file: {tsv_file}")
    # Use Nevergrad's optimizer
    optimizer = ng.optimizers.NGOpt(parametrization=instr, budget=budget)
    # Run the optimization
    recommendation = optimizer.minimize(
        lambda *args, **kwargs: objective(*args, **kwargs, tsv_file=tsv_file)
    )
    # Report the best parameters found for this file
    print(f"Best parameters for {tsv_file}:", recommendation.value)
    best_params_per_tsv[tsv_file] = recommendation.value

# Write best_params_per_tsv to a JSON file
output_json_file = os.path.join(pkl_dir, 'best_params_per_tsv.json')
with open(output_json_file, 'w') as file:
    json.dump(best_params_per_tsv, file, indent=4)
print(f"Best parameters saved to {output_json_file}")
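
# Rough cost note: every objective evaluation launches cvs (= 10) DNNGP runs, so a full
# search is budget * cvs = 200 * 10 = 2000 trainings per tsv file. The JSON maps each
# phenotype file name to recommendation.value, a (positional args, keyword args) pair
# that serializes roughly as [[], {"batch_size": ..., "lr": ..., ...}] (shape shown for
# illustration only).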