# LigUnity / active_learning_scripts / run_cycle_one_model.py
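"""Active-learning driver for a single model.

For each of ``num_repeats`` independent repeats, the script draws a random initial
training batch from the input ligand CSV, then runs ``num_cycles`` active-learning
cycles. Each cycle trains and scores the model through
``active_learning_scripts/run_model.sh``, reads the averaged predictions from the
resulting jsonl file, and moves the selected compounds from the test pool into the
training set for the next cycle.
"""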
import argparse
import json
import os
import random
import shutil
import subprocess

import numpy as np
import pandas as pd


def parse_arguments():
    parser = argparse.ArgumentParser(description='Active Learning Cycle for Ligand Prediction')
    # Input/Output arguments
    parser.add_argument('--input_file', type=str, required=True,
                        help='Input CSV file containing ligand data (e.g., tyk2_fep.csv)')
    parser.add_argument('--results_dir', type=str, required=True,
                        help='Base directory for storing all results')
    parser.add_argument('--al_batch_size', type=int, required=True,
                        help='Number of samples for each active learning batch')
    # Experiment configuration
    parser.add_argument('--num_repeats', type=int, default=5,
                        help='Number of repeated experiments (default: 5)')
    parser.add_argument('--num_cycles', type=int, required=True,
                        help='Number of active learning cycles')
    # Model configuration
    parser.add_argument('--arch', type=str, required=True,
                        help='Model architecture')
    parser.add_argument('--weight_path', type=str, required=True,
                        help='Path to pretrained model weights')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate (default: 0.001)')
    parser.add_argument('--master_port', type=int, default=29500,
                        help='Master port for distributed training (default: 29500)')
    parser.add_argument('--device', type=int, default=0,
                        help='CUDA device index to run the model on (default: 0)')
    parser.add_argument('--begin_greedy', type=int, default=0,
                        help='Cycle index from which selection is purely greedy; '
                             'earlier cycles use half-greedy selection (default: 0)')
    # Random seed
    parser.add_argument('--base_seed', type=int, default=42,
                        help='Base random seed (default: 42)')
    return parser.parse_args()


def run_model(arch, weight_path, results_path, result_file, lr, master_port, train_ligf, test_ligf, device):
    """Run one training/scoring round by delegating to the run_model.sh wrapper script."""
    # Resolve the repository root so the bash script always runs from the project top level
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    cmd = [
        "bash", "./active_learning_scripts/run_model.sh",
        arch,
        weight_path,
        results_path,
        result_file,
        str(lr),
        str(master_port),
        train_ligf,
        test_ligf,
        str(device)
    ]
    subprocess.run(cmd, check=True, cwd=project_root)


def prepare_initial_split(input_file, results_dir, al_batch_size, repeat_idx, cycle_idx, base_seed):
    """Create the initial random train/test split for one repeat and save both CSVs."""
    # Read all ligands
    df = pd.read_csv(input_file)
    # Set random seed for reproducibility (different seed for each repeat)
    random.seed(base_seed + repeat_idx)
    # Randomly select ligands for training; everything else becomes the test pool
    all_indices = list(range(len(df)))
    train_indices = random.sample(all_indices, al_batch_size)
    test_indices = [i for i in all_indices if i not in train_indices]
    # Create train and test frames
    train_df = df.iloc[train_indices]
    test_df = df.iloc[test_indices]
    # Create file names with repeat and cycle information
    train_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    test_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(train_file), exist_ok=True)
    # Save files
    train_df.to_csv(train_file, index=False)
    test_df.to_csv(test_file, index=False)
    return train_file, test_file


def read_jsonl_predictions(results_path, result_file):
    """Read model predictions from a jsonl results file and average them per compound.

    Expected layout (inferred from the parsing below), one JSON object per line:
    the first line holds the SMILES list, every following line one set of predictions
    for those SMILES, e.g.
        {"tyk2": {"smiles": ["...", ...]}}
        {"tyk2": {"pred": [..., ...]}}
    Note that the assay key "tyk2" is hardcoded, so this function only works for
    results produced for the tyk2 target.
    Returns a dictionary mapping SMILES to the averaged prediction.
    """
    predictions = {}
    all_predictions = []
    jsonl_path = os.path.join(results_path, result_file)
    with open(jsonl_path, 'r') as f:
        # Read first line to get the SMILES list
        first_line = f.readline()
        smiles_list = json.loads(first_line.strip())["tyk2"]["smiles"]
        # Read the remaining lines containing predictions
        for line in f:
            pred_line = json.loads(line.strip())
            all_predictions.append(pred_line["tyk2"]["pred"])
    # Convert to a numpy array and average over all prediction lines
    pred_array = np.array(all_predictions)
    mean_predictions = np.mean(pred_array, axis=0)
    # Map each SMILES to its averaged prediction
    for smile, pred in zip(smiles_list, mean_predictions):
        predictions[smile] = float(pred)
    return predictions
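

# Compound selection for the next cycle (implemented in update_splits below):
#   * from cycle index >= begin_greedy: pure greedy, i.e. the al_batch_size compounds
#     with the highest averaged predictions move from the test pool into the train set;
#   * before that: half-greedy, i.e. the top al_batch_size//2 compounds plus a random
#     sample drawn from the remaining test pool to fill the batch.
# The newly selected compounds are appended to the previous training set and removed
# from the test pool.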
def update_splits(results_dir, results_path, result_file, prev_train_file, prev_test_file, repeat_idx, cycle_idx,
                  al_batch_size, begin_greedy):
    # Read predictions from the jsonl results file
    predictions = read_jsonl_predictions(results_path, result_file)
    # Read previous test file and attach the averaged predictions
    test_df = pd.read_csv(prev_test_file)
    test_df['prediction'] = test_df['Smiles'].map(predictions)
    # Sort by predictions (high to low)
    test_df_sorted = test_df.sort_values('prediction', ascending=False)
    # Read previous train file
    train_df = pd.read_csv(prev_train_file)
    # Create new file names
    new_train_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    new_test_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(new_train_file), exist_ok=True)
    if cycle_idx >= begin_greedy:
        # Pure greedy: take the top al_batch_size compounds for training
        new_train_compounds = test_df_sorted.head(al_batch_size)
        remaining_test_compounds = test_df_sorted.iloc[al_batch_size:]
    else:
        # Half-greedy: take the top half of the batch by prediction, fill the rest randomly
        new_train_compounds_tmp_1 = test_df_sorted.head(al_batch_size // 2)
        remaining_test_compounds_tmp = test_df_sorted.iloc[al_batch_size // 2:]
        all_indices = list(range(len(remaining_test_compounds_tmp)))
        train_indices = random.sample(all_indices, al_batch_size - al_batch_size // 2)
        test_indices = [i for i in all_indices if i not in train_indices]
        remaining_test_compounds = remaining_test_compounds_tmp.iloc[test_indices]
        new_train_compounds_tmp_2 = remaining_test_compounds_tmp.iloc[train_indices]
        new_train_compounds = pd.concat([new_train_compounds_tmp_1, new_train_compounds_tmp_2])
    # Combine with previous training data
    combined_train_df = pd.concat([train_df, new_train_compounds])
    # Print a banner plus recall statistics; the /100, /200, /500 denominators are
    # hardcoded counts of top-1%/2%/5% compounds in the benchmark library used here
    for _ in range(3):
        print("########################################")
    print("Cycling: ", cycle_idx)
    print("top_1p: {}/100".format(combined_train_df['top_1p'].sum()))
    print("top_2p: {}/200".format(combined_train_df['top_2p'].sum()))
    print("top_5p: {}/500".format(combined_train_df['top_5p'].sum()))
    # Save files
    combined_train_df.to_csv(new_train_file, index=False)
    remaining_test_compounds.to_csv(new_test_file, index=False)
    return new_train_file, new_test_file


def run_active_learning(args):
    """Run the full experiment: num_repeats independent repeats of num_cycles cycles each."""
    # Start from a clean base results directory
    shutil.rmtree(args.results_dir, ignore_errors=True)
    os.makedirs(args.results_dir, exist_ok=True)
    for repeat_idx in range(args.num_repeats):
        print(f"Starting repeat {repeat_idx}")
        # Initial split for this repeat (cycle 0)
        train_file, test_file = prepare_initial_split(
            args.input_file,
            args.results_dir,
            args.al_batch_size,
            repeat_idx,
            0,  # First cycle
            args.base_seed
        )
        for cycle_idx in range(args.num_cycles):
            print(f"Running cycle {cycle_idx} for repeat {repeat_idx}")
            # All cycles write into the base results directory
            results_path = args.results_dir
            # Result file name for this repeat/cycle; remove any stale copy first
            result_file = f"repeat_{repeat_idx}_cycle_{cycle_idx}_results.jsonl"
            if os.path.exists(f"{args.results_dir}/{result_file}"):
                os.remove(f"{args.results_dir}/{result_file}")
            # Run the model
            run_model(
                arch=args.arch,
                weight_path=args.weight_path,
                results_path=results_path,
                result_file=result_file,
                lr=args.lr,
                master_port=args.master_port,
                train_ligf=train_file,
                test_ligf=test_file,
                device=args.device
            )
            # Update splits for the next cycle (skip after the last cycle)
            if cycle_idx < args.num_cycles - 1:
                train_file, test_file = update_splits(
                    args.results_dir,
                    results_path,
                    result_file,
                    train_file,
                    test_file,
                    repeat_idx,
                    cycle_idx + 1,
                    args.al_batch_size,
                    args.begin_greedy
                )


if __name__ == "__main__":
    args = parse_arguments()
    run_active_learning(args)
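

# Example invocation (illustrative only; the file names, batch size and cycle count
# below are placeholder values, not settings shipped with the repository):
#   python active_learning_scripts/run_cycle_one_model.py \
#       --input_file tyk2_fep.csv \
#       --results_dir ./al_results \
#       --al_batch_size 60 \
#       --num_cycles 5 \
#       --arch <model_arch> \
#       --weight_path <path/to/checkpoint.pt>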