# Standard library
import ipaddress
import json
import os
import re
import urllib.parse
from collections import Counter

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader, random_split
|
|
| |
|
|
| |
# Generic healthcare terms matched as plain substrings against a URL's
# domain (weight 3) and path (weight 1) by healthcare_relevance_score.
# NOTE(review): short entries such as 'med' and 'care' can fire on
# unrelated text (e.g. 'media', 'scared') — confirm this is intended.
HEALTHCARE_KEYWORDS = [
    'health', 'medical', 'hospital', 'clinic', 'pharma', 'patient', 'care', 'med',
    'doctor', 'physician', 'nurse', 'therapy', 'rehab', 'dental', 'cardio', 'neuro',
    'oncology', 'pediatric', 'orthopedic', 'surgery', 'diagnostic', 'wellbeing',
    'wellness', 'ehr', 'emr', 'mychart', 'medicare', 'medicaid', 'insurance'
]

# Well-known providers, labs, pharmacies, insurers, and health agencies;
# a domain hit scores higher (4) than a path hit (2).
HEALTHCARE_INSTITUTIONS = [
    'mayo', 'cleveland', 'hopkins', 'kaiser', 'mount sinai', 'cedars', 'baylor',
    'nhs', 'quest', 'labcorp', 'cvs', 'walgreens', 'aetna', 'cigna', 'unitedhealthcare',
    'bluecross', 'anthem', 'humana', 'va.gov', 'cdc', 'who', 'nih'
]

# Healthcare-flavored TLD fragments and hostname prefixes, matched against
# the domain only (weight 3 each).
HEALTHCARE_DOMAINS = ['.health', '.healthcare', '.medicine', '.hospital', '.clinic', 'mychart.']
|
|
| |
|
|
def url_length(url):
    """Total number of characters in the raw URL string."""
    return len(url)
|
|
def num_dots(url):
    """Count occurrences of '.' anywhere in the URL."""
    return len(url.split('.')) - 1
|
|
def num_hyphens(url):
    """Count occurrences of '-' anywhere in the URL."""
    return sum(ch == '-' for ch in url)
|
|
def num_at(url):
    """Count occurrences of '@' anywhere in the URL."""
    return sum(1 for ch in url if ch == '@')
|
|
def num_tilde(url):
    """Count occurrences of '~' anywhere in the URL."""
    return len([ch for ch in url if ch == '~'])
|
|
def num_underscore(url):
    """Count occurrences of '_' anywhere in the URL."""
    return sum(ch == '_' for ch in url)
|
|
def num_percent(url):
    """Count occurrences of '%' (percent-encoding markers) in the URL."""
    return len(url.split('%')) - 1
|
|
def num_ampersand(url):
    """Count occurrences of '&' (query-parameter separators) in the URL."""
    return sum(1 for ch in url if ch == '&')
|
|
def num_hash(url):
    """Count occurrences of '#' anywhere in the URL."""
    return len(url.split('#')) - 1
|
|
def has_https(url):
    """Return 1 when the URL begins with the literal 'https://' scheme, else 0."""
    return 1 if url.startswith('https://') else 0
|
|
def has_ip_address(url):
    """Return 1 if the URL's host is a literal IP address (IPv4 or IPv6), else 0.

    Uses urlparse().hostname, which strips any :port suffix and the square
    brackets around IPv6 literals, then validates with ipaddress.ip_address.
    The previous regex missed hosts carrying a port (e.g. 1.2.3.4:8080) and
    accepted out-of-range octets such as 999.999.999.999.
    """
    try:
        host = urllib.parse.urlparse(url).hostname
        if not host:
            # No network location at all (e.g. a relative or malformed URL).
            return 0
        ipaddress.ip_address(host)  # raises ValueError for non-IP hosts
        return 1
    except ValueError:
        # Raised by urlparse for invalid netlocs and by ip_address for names.
        return 0
|
|
def get_hostname_length(url):
    """Return the length of the URL's network location (host[:port]).

    urlparse can raise ValueError on malformed netlocs (e.g. an unbalanced
    IPv6 bracket); the previous bare `except:` also swallowed
    KeyboardInterrupt and SystemExit.
    """
    try:
        return len(urllib.parse.urlparse(url).netloc)
    except ValueError:
        return 0
|
|
def get_path_length(url):
    """Return the length of the URL's path component ('' -> 0).

    Narrowed the bare `except:` to ValueError (the only error urlparse
    raises on malformed input).
    """
    try:
        return len(urllib.parse.urlparse(url).path)
    except ValueError:
        return 0
|
|
def get_path_level(url):
    """Return the path depth, measured as the number of '/' in the path.

    Narrowed the bare `except:` to ValueError (the only error urlparse
    raises on malformed input).
    """
    try:
        return urllib.parse.urlparse(url).path.count('/')
    except ValueError:
        return 0
|
|
def get_subdomain_level(url):
    """Return the number of subdomain labels in the hostname.

    'a.b.example.com' -> 2, 'example.com' -> 0. IP-literal hosts return 0
    even though they contain dots. Narrowed the bare `except:` to
    ValueError.
    """
    try:
        parsed_url = urllib.parse.urlparse(url)
        hostname = parsed_url.netloc
        # Dotted IPv4 components are not subdomains.
        if has_ip_address(url):
            return 0

        parts = hostname.split('.')
        # The last two labels are assumed to be registered domain + TLD.
        return max(len(parts) - 2, 0)
    except ValueError:
        return 0
|
|
def has_double_slash_in_path(url):
    """Return 1 if the path contains '//' (a common obfuscation), else 0.

    The scheme's '//' is not part of the path, so it never triggers this.
    Narrowed the bare `except:` to ValueError.
    """
    try:
        return int('//' in urllib.parse.urlparse(url).path)
    except ValueError:
        return 0
|
|
def get_tld(url):
    """Return the lowercase top-level domain label, or '' if there is none.

    Uses urlparse().hostname (already lowercased, port stripped) instead of
    netloc: the previous code returned e.g. 'com:8080' for hosts with an
    explicit port. Bare `except:` narrowed to ValueError.
    """
    try:
        host = urllib.parse.urlparse(url).hostname or ''
        parts = host.split('.')
        return parts[-1] if len(parts) > 1 else ''
    except ValueError:
        return ''
|
|
def count_digits(url):
    """Number of decimal-digit characters in the URL."""
    return len([ch for ch in url if ch.isdigit()])
|
|
def digit_ratio(url):
    """Fraction of the URL's characters that are digits (0 for empty input)."""
    if not url:
        return 0
    return sum(ch.isdigit() for ch in url) / len(url)
|
|
def count_letters(url):
    """Number of alphabetic characters in the URL."""
    return len([ch for ch in url if ch.isalpha()])
|
|
def letter_ratio(url):
    """Fraction of the URL's characters that are letters (0 for empty input)."""
    if not url:
        return 0
    return sum(ch.isalpha() for ch in url) / len(url)
|
|
def count_special_chars(url):
    """Number of characters that are neither alphanumeric nor whitespace."""
    return len([ch for ch in url if not ch.isalnum() and not ch.isspace()])
|
|
def special_char_ratio(url):
    """Fraction of the URL's characters that are special (0 for empty input)."""
    if not url:
        return 0
    return sum(not ch.isalnum() and not ch.isspace() for ch in url) / len(url)
|
|
def get_query_length(url):
    """Return the length of the query string (without the leading '?').

    Narrowed the bare `except:` to ValueError.
    """
    try:
        return len(urllib.parse.urlparse(url).query)
    except ValueError:
        return 0
|
|
def get_fragment_length(url):
    """Return the length of the fragment (without the leading '#').

    Narrowed the bare `except:` to ValueError.
    """
    try:
        return len(urllib.parse.urlparse(url).fragment)
    except ValueError:
        return 0
|
|
def healthcare_relevance_score(url):
    """
    Calculate a heuristic healthcare relevance score in [0, 1].
    Higher scores indicate stronger relation to healthcare.

    Matching is plain substring matching against the lowercased domain and
    path; domain hits weigh more than path hits. Raw points are divided by
    10 and capped at 1.0. Added a ValueError guard around urlparse so this
    extractor degrades to 0.0 on malformed URLs, consistent with the other
    feature extractors.
    """
    url_lower = url.lower()
    try:
        parsed_url = urllib.parse.urlparse(url_lower)
    except ValueError:
        # Unparseable URL -> neutral score, like the other extractors.
        return 0.0
    domain = parsed_url.netloc
    path = parsed_url.path

    score = 0

    # Generic terms: domain hit = 3, path hit = 1.
    for keyword in HEALTHCARE_KEYWORDS:
        if keyword in domain:
            score += 3
        elif keyword in path:
            score += 1

    # Known institutions: domain hit = 4, path hit = 2.
    for institution in HEALTHCARE_INSTITUTIONS:
        if institution in domain:
            score += 4
        elif institution in path:
            score += 2

    # Healthcare TLD fragments / hostname prefixes: domain only, 3 points.
    for healthcare_domain in HEALTHCARE_DOMAINS:
        if healthcare_domain in domain:
            score += 3

    # Patient-portal signals.
    if 'portal' in domain or 'portal' in path:
        score += 2
    if 'patient' in domain or 'mychart' in domain:
        score += 3
    if 'ehr' in domain or 'emr' in domain:
        score += 3

    # Normalize to [0, 1].
    return min(score / 10.0, 1.0)
|
|
def extract_features(url):
    """Build the 22-element feature vector for one URL.

    The element order matches get_feature_names() exactly.
    """
    structural = [
        num_dots(url),
        get_subdomain_level(url),
        get_path_level(url),
        url_length(url),
        num_hyphens(url),
        num_at(url),
        num_tilde(url),
        num_underscore(url),
        num_percent(url),
        num_ampersand(url),
        num_hash(url),
        has_https(url),
        has_ip_address(url),
        get_hostname_length(url),
        get_path_length(url),
        has_double_slash_in_path(url),
    ]
    statistical = [
        digit_ratio(url),
        letter_ratio(url),
        special_char_ratio(url),
        get_query_length(url),
        get_fragment_length(url),
        healthcare_relevance_score(url),
    ]
    return structural + statistical
|
|
def get_feature_names():
    """Feature names, in the exact order produced by extract_features()."""
    structural = [
        'num_dots', 'subdomain_level', 'path_level', 'url_length',
        'num_hyphens', 'num_at', 'num_tilde', 'num_underscore',
        'num_percent', 'num_ampersand', 'num_hash', 'has_https',
        'has_ip_address', 'hostname_length', 'path_length',
        'double_slash_in_path',
    ]
    statistical = [
        'digit_ratio', 'letter_ratio',
        'special_char_ratio', 'query_length', 'fragment_length',
        'healthcare_relevance',
    ]
    return structural + statistical
|
|
| |
|
|
class URLDataset(Dataset):
    """PyTorch Dataset over pre-extracted URL feature vectors.

    Args:
        features (numpy.ndarray): per-URL feature vectors
        labels (numpy.ndarray): per-URL labels (0 = benign, 1 = malicious)
    """

    def __init__(self, features, labels):
        # float32 for model inputs, long for classification targets.
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
|
|
def load_huggingface_data(file_path):
    """
    Read the Hugging Face URL dataset (a JSON list of {'text', 'label'} records).

    Args:
        file_path: Path to the JSON file

    Returns:
        List of (url, label) tuples; records with an empty URL or a missing
        label (defaulted to -1) are dropped.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    url_data = [
        (record.get('text', ''), record.get('label', -1))
        for record in records
    ]
    url_data = [(url, label) for url, label in url_data if url and label != -1]

    print(f"Loaded {len(url_data)} URLs from Hugging Face dataset")
    return url_data
|
|
def load_phiusiil_data(file_path):
    """
    Load the PhiUSIIL dataset from a CSV file with 'URL' and 'label' columns.

    Args:
        file_path: Path to the CSV file

    Returns:
        List of (url, label) tuples; rows with a non-string/blank URL or a
        NaN label are skipped.
    """
    df = pd.read_csv(file_path)

    url_data = []
    # Iterate the two columns directly: df.iterrows() builds a Series per
    # row and is dramatically slower on large files, with no benefit here.
    for url, label in zip(df['URL'], df['label']):
        if isinstance(url, str) and url.strip() and not pd.isna(label):
            url_data.append((url, label))

    print(f"Loaded {len(url_data)} URLs from PhiUSIIL dataset")
    return url_data
|
|
def load_kaggle_data(file_path):
    """
    Load the Kaggle malicious_phish.csv dataset ('url' and 'type' columns).

    The multi-class 'type' column is collapsed to binary: 'benign' -> 0,
    everything else (phishing, defacement, malware, missing) -> 1.

    Args:
        file_path: Path to the CSV file

    Returns:
        List of (url, label) tuples; rows with a non-string/blank URL are skipped.
    """
    df = pd.read_csv(file_path)

    url_data = []
    # zip over columns instead of the much slower df.iterrows().
    for url, type_val in zip(df['url'], df['type']):
        if not isinstance(url, str) or not url.strip():
            continue

        # Guard the isinstance check: a missing 'type' is read as float NaN
        # and the previous unconditional .lower() call crashed on it.
        label = 0 if isinstance(type_val, str) and type_val.lower() == 'benign' else 1
        url_data.append((url, label))

    print(f"Loaded {len(url_data)} URLs from Kaggle dataset")
    return url_data
|
|
def combine_and_deduplicate(datasets):
    """
    Merge several (url, label) datasets into one, deduplicating by URL.

    Args:
        datasets: List of datasets, each an iterable of (url, label) tuples

    Returns:
        Tuple of (urls, labels) with duplicates removed; insertion order
        of first appearance is preserved.
    """
    merged = {}

    for dataset in datasets:
        for url, label in dataset:
            # A URL seen with conflicting labels is kept as malicious
            # (max of the 0/1 labels).
            merged[url] = label if url not in merged else max(merged[url], label)

    urls = list(merged)
    labels = [merged[u] for u in urls]

    print(f"After deduplication: {len(urls)} unique URLs")

    label_counts = Counter(labels)
    print(f"Class distribution - Benign (0): {label_counts[0]}, Malicious (1): {label_counts[1]}")

    return urls, labels
|
|
def extract_all_features(urls):
    """
    Extract the feature matrix for a list of URLs.

    Args:
        urls: List of URL strings

    Returns:
        float32 numpy array of shape (len(urls), n_features); URLs whose
        extraction raises are logged and zero-filled (best effort).
    """
    n_features = len(get_feature_names())
    vectors = []

    for url in tqdm.tqdm(urls, desc="Extracting features"):
        try:
            vectors.append(extract_features(url))
        except Exception as e:
            # Deliberate best-effort: log and keep a zero row so the matrix
            # stays aligned with the input URL list.
            print(f"Error extracting features from {url}: {str(e)}")
            vectors.append([0] * n_features)

    return np.array(vectors, dtype=np.float32)
|
|
| |
class PhishingMLP(nn.Module):
    """Multilayer Perceptron for phishing-URL detection.

    Stacks Linear+ReLU hidden layers and ends with Linear+Sigmoid, so the
    forward pass emits a probability in (0, 1) suitable for nn.BCELoss.
    """

    def __init__(self, input_size=22, hidden_sizes=None, output_size=1):
        """
        Args:
            input_size: Number of input features (default: 22)
            hidden_sizes: List of neurons in each hidden layer
                (default: [22, 30, 10])
            output_size: Number of output units (1 for binary classification)
        """
        super(PhishingMLP, self).__init__()

        # Default resolved inside the body: a mutable default argument
        # ([22, 30, 10] in the signature) is shared across all calls.
        if hidden_sizes is None:
            hidden_sizes = [22, 30, 10]

        self.layers = nn.ModuleList()

        # Input layer.
        self.layers.append(nn.Linear(input_size, hidden_sizes[0]))
        self.layers.append(nn.ReLU())

        # Hidden layers.
        for i in range(len(hidden_sizes) - 1):
            self.layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i + 1]))
            self.layers.append(nn.ReLU())

        # Output layer with sigmoid for probabilities.
        self.layers.append(nn.Linear(hidden_sizes[-1], output_size))
        self.layers.append(nn.Sigmoid())

    def forward(self, x):
        """Forward pass through the network."""
        for layer in self.layers:
            x = layer(x)
        return x
|
|
| |
def train_mlp(model, train_loader, val_loader, epochs=25, learning_rate=0.001, device="cpu"):
    """
    Train the MLP with BCE loss and Adam, validating after every epoch.

    Args:
        model: The MLP model (must emit sigmoid probabilities of shape (batch, 1))
        train_loader: DataLoader for training data
        val_loader: DataLoader for validation data
        epochs: Number of training epochs
        learning_rate: Adam learning rate
        device: Device to train on (cpu or cuda)

    Returns:
        Tuple of (trained_model, train_losses, val_losses, val_accuracies)
    """
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_losses, val_losses, val_accuracies = [], [], []

    print(f"Training on {device}...")
    for epoch in range(epochs):
        # --- Training pass ---
        model.train()
        loss_sum = 0.0

        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            probs = model(batch_x)
            # BCELoss wants float targets shaped like the output: (batch, 1).
            loss = criterion(probs, batch_y.unsqueeze(1).float())
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()

        epoch_train_loss = loss_sum / len(train_loader)
        train_losses.append(epoch_train_loss)

        # --- Validation pass ---
        model.eval()
        val_loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                probs = model(batch_x)

                val_loss_sum += criterion(probs, batch_y.unsqueeze(1).float()).item()

                # 0.5 decision threshold on the sigmoid output.
                hard = (probs > 0.5).float()
                n_seen += batch_y.size(0)
                n_correct += (hard.squeeze() == batch_y.float()).sum().item()

        epoch_val_loss = val_loss_sum / len(val_loader)
        val_losses.append(epoch_val_loss)

        val_accuracy = 100 * n_correct / n_seen
        val_accuracies.append(val_accuracy)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

    return model, train_losses, val_losses, val_accuracies
|
|
def evaluate_model(model, test_loader, device):
    """
    Evaluate the trained model on test data.

    Args:
        model: Trained model emitting sigmoid probabilities of shape (batch, 1)
        test_loader: DataLoader for test data
        device: Device to evaluate on

    Returns:
        Tuple of (accuracy, precision, recall, f1, healthcare_accuracy).
        (The previous docstring claimed a 4-tuple; five values are returned.)
    """
    model.to(device)
    model.eval()

    correct = 0
    total = 0
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    healthcare_correct = 0
    healthcare_total = 0

    # Column of the healthcare-relevance feature in the input vectors.
    feature_idx = get_feature_names().index('healthcare_relevance')
    # NOTE(review): inputs are typically standardized upstream, so this 0.5
    # cutoff may no longer map to the raw 0-1 relevance scale — verify.
    healthcare_threshold = 0.5

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            # view(-1) rather than squeeze(): squeeze() on a final batch of
            # size 1 produces a 0-d tensor and predicted[i] below would
            # raise IndexError.
            predicted = (outputs > 0.5).float().view(-1)

            total += labels.size(0)
            correct += (predicted == labels.float()).sum().item()

            # Per-sample confusion-matrix tallies and healthcare-subset stats.
            for i in range(labels.size(0)):
                if labels[i] == 1 and predicted[i] == 1:
                    true_positives += 1
                elif labels[i] == 0 and predicted[i] == 1:
                    false_positives += 1
                elif labels[i] == 1 and predicted[i] == 0:
                    false_negatives += 1

                if inputs[i, feature_idx] >= healthcare_threshold:
                    healthcare_total += 1
                    if predicted[i] == labels[i]:
                        healthcare_correct += 1

    accuracy = 100 * correct / total
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    healthcare_accuracy = 100 * healthcare_correct / healthcare_total if healthcare_total > 0 else 0.0

    print(f"Overall Test Accuracy: {accuracy:.2f}%")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print(f"Healthcare URLs identified: {healthcare_total} ({healthcare_total/total*100:.2f}%)")
    print(f"Healthcare URL Detection Accuracy: {healthcare_accuracy:.2f}%")

    return accuracy, precision, recall, f1, healthcare_accuracy
|
|
def plot_training_results(train_losses, val_losses, val_accuracies):
    """
    Plot the loss and accuracy curves side by side, save them to
    'training_results.png', and display the figure.

    Args:
        train_losses: Per-epoch training losses
        val_losses: Per-epoch validation losses
        val_accuracies: Per-epoch validation accuracies (percent)
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: loss curves.
    loss_ax.plot(train_losses, label='Training Loss')
    loss_ax.plot(val_losses, label='Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training and Validation Loss')
    loss_ax.legend()

    # Right panel: accuracy curve.
    acc_ax.plot(val_accuracies, label='Validation Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy (%)')
    acc_ax.set_title('Validation Accuracy')
    acc_ax.legend()

    fig.tight_layout()
    fig.savefig('training_results.png')
    plt.show()
|
|
def analyze_healthcare_features(features, labels, pred_labels):
    """
    Analyze how the model performs on healthcare-related URLs.

    Prints, for several healthcare-relevance thresholds, the subset size,
    accuracy, precision/recall/F1, and false positive/negative rates.

    Args:
        features: Feature vectors (2-d numpy array, one row per URL)
        labels: True labels (numpy array of 0/1)
        pred_labels: Predicted labels (numpy array of 0/1)
    """
    # Column holding the healthcare relevance score.
    healthcare_idx = get_feature_names().index('healthcare_relevance')
    healthcare_scores = features[:, healthcare_idx]

    # NOTE(review): these cutoffs assume raw 0-1 relevance scores; if the
    # caller passes standardized features the buckets are skewed — verify.
    thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]

    print("\n=== Healthcare URL Analysis ===")
    print("Healthcare relevance score distribution:")
    for threshold in thresholds:
        count = np.sum(healthcare_scores >= threshold)
        percent = (count / len(healthcare_scores)) * 100
        print(f" Score >= {threshold}: {count} URLs ({percent:.2f}%)")

    # Per-threshold classification metrics on the healthcare subset.
    for threshold in thresholds:
        mask = healthcare_scores >= threshold
        if np.sum(mask) == 0:
            # No URLs reach this threshold; nothing to report.
            continue

        h_labels = labels[mask]
        h_preds = pred_labels[mask]
        h_accuracy = np.mean(h_labels == h_preds) * 100

        benign_count = np.sum(h_labels == 0)
        malicious_count = np.sum(h_labels == 1)

        print(f"\nFor healthcare relevance >= {threshold}:")
        print(f" URLs: {np.sum(mask)} ({benign_count} benign, {malicious_count} malicious)")
        print(f" Accuracy: {h_accuracy:.2f}%")

        # Confusion-matrix counts within the subset.
        tp = np.sum((h_labels == 1) & (h_preds == 1))
        fp = np.sum((h_labels == 0) & (h_preds == 1))
        fn = np.sum((h_labels == 1) & (h_preds == 0))

        # Guard each ratio against a zero denominator.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print(f" Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

        # Rates are only defined when the corresponding class is present.
        if benign_count > 0:
            h_fpr = np.sum((h_labels == 0) & (h_preds == 1)) / benign_count
            print(f" False Positive Rate: {h_fpr:.4f}")

        if malicious_count > 0:
            h_fnr = np.sum((h_labels == 1) & (h_preds == 0)) / malicious_count
            print(f" False Negative Rate: {h_fnr:.4f}")
|
|
| |
def main():
    """Run the end-to-end pipeline: load the three datasets, merge and
    featurize them, train the MLP, then evaluate and report
    healthcare-specific metrics."""
    # --- Hyperparameters ---
    batch_size = 32
    learning_rate = 0.001
    epochs = 20
    test_size = 0.2    # fraction of all data held out for testing
    val_size = 0.2     # fraction of all data used for validation
    random_seed = 42
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # --- Input files (expected in the working directory) ---
    huggingface_file = "urls.json"
    phiusiil_file = "PhiUSIIL_Phishing_URL_Dataset.csv"
    kaggle_file = "malicious_phish.csv"

    # --- Load the three source datasets ---
    print("Loading datasets...")
    huggingface_data = load_huggingface_data(huggingface_file)
    phiusiil_data = load_phiusiil_data(phiusiil_file)
    kaggle_data = load_kaggle_data(kaggle_file)

    # --- Merge and deduplicate by URL ---
    print("Combining and deduplicating datasets...")
    urls, labels = combine_and_deduplicate([huggingface_data, phiusiil_data, kaggle_data])

    # --- Feature extraction (one 22-element vector per URL) ---
    print("Extracting features...")
    features = extract_all_features(urls)

    # --- Stratified train/val/test split ---
    print("Splitting data...")
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_seed, stratify=labels
    )

    # val_size is rescaled because it is carved out of the remaining data.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_size/(1-test_size),
        random_state=random_seed, stratify=y_train_val
    )

    # --- Standardize features (scaler fitted on train only) ---
    print("Standardizing features...")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # --- Wrap the splits in DataLoaders ---
    print("Creating DataLoaders...")
    train_dataset = URLDataset(X_train, y_train)
    val_dataset = URLDataset(X_val, y_val)
    test_dataset = URLDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # --- Build and train the model ---
    print("Initializing model...")
    input_size = features.shape[1]
    model = PhishingMLP(input_size=input_size)

    print("Training model...")
    trained_model, train_losses, val_losses, val_accuracies = train_mlp(
        model, train_loader, val_loader, epochs=epochs,
        learning_rate=learning_rate, device=device
    )

    # --- Persist trained weights ---
    print("Saving model...")
    model_path = "phishing_mlp_model.pth"
    torch.save(trained_model.state_dict(), model_path)
    print(f"Model saved to {model_path}")

    # --- Test-set evaluation ---
    print("\nEvaluating model on test set...")
    acc, prec, rec, f1, healthcare_acc = evaluate_model(trained_model, test_loader, device)

    # --- Training-curve plots ---
    plot_training_results(train_losses, val_losses, val_accuracies)

    # --- Collect hard predictions for the healthcare analysis ---
    y_pred = []
    trained_model.eval()
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(device)
            outputs = trained_model(inputs)
            # NOTE(review): if the final batch has exactly one sample,
            # squeeze() yields a 0-d array whose tolist() is a scalar and
            # extend() would raise TypeError — confirm the test-set size.
            predicted = (outputs > 0.5).float().squeeze().cpu().numpy()
            y_pred.extend(predicted.tolist())

    # NOTE(review): X_test is standardized here, so the 0-1 relevance
    # thresholds inside analyze_healthcare_features may not line up with
    # the raw score scale — verify.
    analyze_healthcare_features(X_test, np.array(y_test), np.array(y_pred))

    # --- Show a few high-relevance healthcare URLs (raw features) ---
    feature_names = get_feature_names()
    healthcare_idx = feature_names.index('healthcare_relevance')
    healthcare_scores = features[:, healthcare_idx]
    high_healthcare = healthcare_scores >= 0.5

    print("\n=== Healthcare URL Examples ===")
    high_healthcare_indices = np.where(high_healthcare)[0][:5]
    for idx in high_healthcare_indices:
        print(f"URL: {urls[idx]}")
        print(f"Healthcare Score: {healthcare_scores[idx]:.2f}")
        print(f"Label: {'Malicious' if labels[idx] == 1 else 'Benign'}")
        print()

    # --- Final summary ---
    print("\n=== Summary ===")
    print(f"Total URLs processed: {len(urls)}")
    print(f"Training set: {len(X_train)} URLs")
    print(f"Validation set: {len(X_val)} URLs")
    print(f"Test set: {len(X_test)} URLs")
    print(f"Model input features: {input_size}")
    print(f"Test Accuracy: {acc:.2f}%")
    print(f"Healthcare URL Accuracy: {healthcare_acc:.2f}%")
    print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")
    print("\nTraining complete!")
|
|
| if __name__ == "__main__": |
| main() |