{
  "title": "K-Means Clustering Mastery: 100 MCQs",
  "description": "A comprehensive set of 100 multiple-choice questions designed to test and deepen your understanding of K-Means Clustering, covering basic concepts, algorithm steps, practical scenarios, and challenges in high-dimensional or real-world datasets.",
  "questions": [
    {
      "id": 1,
      "questionText": "What is the primary goal of K-Means Clustering?",
      "options": [
        "Partition data into K clusters minimizing within-cluster variance",
        "Reduce the dimensionality of the dataset",
        "Detect outliers in the dataset",
        "Classify data into predefined categories"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means aims to divide data into K clusters such that the sum of squared distances between points and their cluster centroid is minimized."
    },
    {
      "id": 2,
      "questionText": "In K-Means, what does a 'centroid' represent?",
      "options": [
        "A random point from the dataset",
        "The farthest point from the cluster",
        "The maximum value in the cluster",
        "The mean position of all points in the cluster"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The centroid is the mean of all points in a cluster and represents the cluster's center."
    },
    {
      "id": 3,
      "questionText": "Which step is repeated in K-Means until convergence?",
      "options": [
        "Compute correlation matrix",
        "Assign points to nearest centroid and update centroids",
        "Remove outliers",
        "Randomly shuffle data points"
      ],
      "correctAnswerIndex": 1,
      "explanation": "K-Means iteratively assigns points to the nearest centroid and recalculates centroids until assignments stabilize."
    },
    {
      "id": 4,
      "questionText": "Scenario: K-Means converges but clusters are uneven in size. Likely reason?",
      "options": [
        "Centroids are incorrect",
        "Distance metric used is Euclidean",
        "Data distribution is skewed",
        "Algorithm failed"
      ],
      "correctAnswerIndex": 2,
      "explanation": "K-Means partitions based on distances; skewed or non-spherical distributions can lead to uneven cluster sizes."
    },
    {
      "id": 5,
      "questionText": "Which of the following are limitations of K-Means clustering?",
      "options": [
        "Sensitive to outliers",
        "Requires predefined number of clusters (K)",
        "Only works for numerical data",
        "All of the above"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means requires K as input, works only with numerical data, and is sensitive to outliers."
    },
    {
      "id": 6,
      "questionText": "Scenario: K-Means applied to customer locations. Distance metric to use?",
      "options": [
        "Euclidean distance",
        "Hamming distance",
        "Cosine similarity",
        "Jaccard index"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Euclidean distance is standard for K-Means and spatial numerical data."
    },
    {
      "id": 7,
      "questionText": "Scenario: K-Means on 2D points gives varying results with different initial centroids. Solution?",
      "options": [
        "Use hierarchical clustering instead",
        "Ignore initial centroids",
        "Reduce K",
        "Use K-Means++ initialization"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means++ selects better initial centroids to improve convergence and consistency."
    },
    {
      "id": 8,
      "questionText": "Scenario: K-Means on concentric circles fails. Reason?",
      "options": [
        "K-Means assumes spherical clusters",
        "Data contains outliers",
        "Distance metric wrong",
        "Algorithm converged too quickly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means works best for convex, spherical clusters; it cannot separate concentric circular clusters."
    },
    {
      "id": 9,
      "questionText": "Scenario: After K-Means clustering, silhouette score is low. Interpretation?",
      "options": [
        "Noise ignored automatically",
        "Clusters overlap or are poorly defined",
        "Clusters are perfect",
        "Algorithm converged correctly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "A low silhouette score indicates that points lie nearly as close to neighboring clusters as to their own, so clusters are not well separated."
    },
    {
      "id": 10,
      "questionText": "Scenario: Large dataset with millions of points. K-Means limitation?",
      "options": [
        "Algorithm fails completely",
        "Distance metric is irrelevant",
        "Cannot calculate centroids",
        "Convergence can be slow; consider Mini-Batch K-Means"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Mini-Batch K-Means is a faster variant suitable for large datasets."
    },
    {
      "id": 11,
      "questionText": "Scenario: K-Means on a dataset with outliers. Effect?",
      "options": [
        "Algorithm removes outliers",
        "Centroids can shift towards outliers, distorting clusters",
        "Clusters become more compact",
        "Clusters ignore outliers automatically"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Outliers can disproportionately affect centroids, leading to poorly defined clusters."
    },
    {
      "id": 12,
      "questionText": "Scenario: K-Means on categorical data. Limitation?",
      "options": [
        "K-Means requires numerical data; cannot handle categorical directly",
        "Clusters merge randomly",
        "Categorical data improves clustering",
        "Algorithm automatically encodes categories"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means relies on distance metrics, which are not directly defined for categorical data."
    },
    {
      "id": 13,
      "questionText": "Scenario: Selecting K for K-Means. Which method helps?",
      "options": [
        "Merge dendrograms",
        "Random selection",
        "Elbow method",
        "Silhouette ignored"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The Elbow method plots the sum of squared errors against K; the point where the curve bends (the 'elbow') suggests an optimal K."
    },
    {
      "id": 14,
      "questionText": "Scenario: K-Means fails to separate overlapping clusters. Likely reason?",
      "options": [
        "K too small",
        "Centroids are optimal",
        "Algorithm converged correctly",
        "Clusters are not well-separated or non-convex"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means works best for well-separated convex clusters; overlapping clusters are challenging."
    },
    {
      "id": 15,
      "questionText": "Scenario: K-Means with high-dimensional data. Challenge?",
      "options": [
        "Algorithm fails automatically",
        "Noise ignored",
        "Distance metrics lose meaning; consider PCA or feature scaling",
        "Clusters are always compact"
      ],
      "correctAnswerIndex": 2,
      "explanation": "High-dimensional spaces dilute distances; dimensionality reduction improves clustering."
    },
    {
      "id": 16,
      "questionText": "Scenario: K-Means with K too large. Effect?",
      "options": [
        "Algorithm automatically reduces K",
        "Clusters always improve",
        "Clusters may become small and meaningless",
        "Noise ignored"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Choosing K too large can lead to many tiny clusters with no meaningful pattern."
    },
    {
      "id": 17,
      "questionText": "Scenario: Mini-Batch K-Means. Advantage?",
      "options": [
        "Faster for large datasets with approximate centroids",
        "Removes noise automatically",
        "More accurate than standard K-Means",
        "Works only on small datasets"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Mini-Batch K-Means updates centroids using small random batches for efficiency on large datasets."
    },
    {
      "id": 18,
      "questionText": "Scenario: K-Means initialization affects results. Solution?",
      "options": [
        "Use single random centroid only",
        "Run algorithm multiple times or use K-Means++",
        "Ignore initialization",
        "Reduce K randomly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "K-Means++ and multiple runs improve stability and reduce sensitivity to initial centroids."
    },
    {
      "id": 19,
      "questionText": "Scenario: K-Means distance metric. Standard choice?",
      "options": [
        "Euclidean distance",
        "Jaccard index",
        "Hamming distance",
        "Cosine similarity"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means typically uses Euclidean distance to assign points to nearest centroids."
    },
    {
      "id": 20,
      "questionText": "Scenario: K-Means clustering produces empty clusters. Cause?",
      "options": [
        "Algorithm failed",
        "Clusters are compact",
        "No points assigned to some centroids",
        "Centroids are optimal"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Some centroids may not attract any points, leading to empty clusters."
    },
    {
      "id": 21,
      "questionText": "Scenario: K-Means on text embeddings. Preprocessing required?",
      "options": [
        "Use categorical K-Means directly",
        "No preprocessing needed",
        "Randomly assign clusters",
        "Normalize or scale vectors before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Text embeddings often need normalization to prevent certain dimensions from dominating distance computations."
    },
    {
      "id": 22,
      "questionText": "Scenario: K-Means with very high K. Effect on SSE (sum of squared errors)?",
      "options": [
        "SSE increases",
        "SSE is ignored",
        "SSE decreases as K increases",
        "SSE remains constant"
      ],
      "correctAnswerIndex": 2,
      "explanation": "As K increases, each point lies closer to its nearest centroid, so the sum of squared errors decreases."
    },
    {
      "id": 23,
      "questionText": "Scenario: K-Means on scaled vs unscaled features. Effect?",
      "options": [
        "Clusters merge randomly",
        "Scaling is important; features with large range dominate clustering",
        "Algorithm fails if not scaled",
        "Scaling is unnecessary"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Features with larger numerical ranges can dominate Euclidean distance; scaling ensures fair contribution."
    },
    {
      "id": 24,
      "questionText": "Scenario: K-Means convergence criteria. Standard check?",
      "options": [
        "Centroid positions or cluster assignments stop changing",
        "Distance metric ignored",
        "Random stopping",
        "Maximum iterations only"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Algorithm stops when centroids or cluster assignments stabilize, or after a max number of iterations."
    },
    {
      "id": 25,
      "questionText": "Scenario: K-Means for image compression. How?",
      "options": [
        "Cluster pixel colors and replace each pixel by its centroid",
        "Use hierarchical clustering",
        "Remove noise automatically",
        "Reduce image resolution"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means clusters similar colors, allowing image compression by using cluster centroids as representative colors."
    },
    {
      "id": 26,
      "questionText": "Scenario: K-Means for customer segmentation. Benefit?",
      "options": [
        "Identify customer groups for targeted marketing",
        "Automatically predicts sales",
        "Detects trends over time",
        "Removes outliers"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means helps segment customers based on behavior, allowing targeted campaigns."
    },
    {
      "id": 27,
      "questionText": "Scenario: K-Means clustering results differ on repeated runs. Cause?",
      "options": [
        "Distance metric varies",
        "Random initialization of centroids",
        "Algorithm deterministic",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Random initial centroids can lead to different final clusters; K-Means++ mitigates this."
    },
    {
      "id": 28,
      "questionText": "Scenario: K-Means for anomaly detection. Approach?",
      "options": [
        "Clusters merge randomly",
        "Points far from nearest centroid may be anomalies",
        "All points treated equally",
        "Noise automatically ignored"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Outliers are detected as points distant from cluster centroids."
    },
    {
      "id": 29,
      "questionText": "Scenario: K-Means clustering on geospatial data. Best practice?",
      "options": [
        "Use distance metric appropriate for coordinates (e.g., haversine)",
        "Randomly assign clusters",
        "Use Euclidean blindly",
        "Clusters merge arbitrarily"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Euclidean distance may misrepresent geographic distances; use geodesic metrics like haversine."
    },
    {
      "id": 30,
      "questionText": "Scenario: K-Means with highly correlated features. Solution?",
      "options": [
        "Increase K randomly",
        "Apply PCA to reduce correlated dimensions",
        "Ignore correlations",
        "Clusters merge arbitrarily"
      ],
      "correctAnswerIndex": 1,
      "explanation": "PCA reduces correlated features and improves clustering performance."
    },
    {
      "id": 31,
      "questionText": "Scenario: K-Means on non-spherical clusters. Limitation?",
      "options": [
        "Noise ignored",
        "Clusters are always compact",
        "Algorithm automatically adapts",
        "K-Means assumes spherical clusters; non-spherical clusters may be poorly separated"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means relies on Euclidean distance and assumes roughly spherical clusters, so elongated or irregular clusters are not well captured."
    },
    {
      "id": 32,
      "questionText": "Scenario: K-Means clustering produces clusters with very different densities. Challenge?",
      "options": [
        "Distance metric ignored",
        "Clusters always equal",
        "Low-density clusters may be merged incorrectly",
        "Algorithm detects densities automatically"
      ],
      "correctAnswerIndex": 2,
      "explanation": "K-Means does not account for density; clusters with differing densities may not be separated properly."
    },
    {
      "id": 33,
      "questionText": "Scenario: K-Means applied to text embeddings. Best practice?",
      "options": [
        "Increase K arbitrarily",
        "Normalize embeddings to unit vectors before clustering",
        "Remove half the features randomly",
        "Use raw embeddings"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Normalization ensures that distance computation reflects angle similarity rather than magnitude differences."
    },
    {
      "id": 34,
      "questionText": "Scenario: K-Means clustering with missing values. Approach?",
      "options": [
        "Ignore missing values",
        "Randomly assign missing values",
        "Algorithm automatically handles them",
        "Impute missing values before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means requires complete numerical data; missing values should be imputed or removed."
    },
    {
      "id": 35,
      "questionText": "Scenario: K-Means clustering on multi-dimensional customer features. Preprocessing step?",
      "options": [
        "Randomly drop features",
        "Scale features so all dimensions contribute equally",
        "Increase K arbitrarily",
        "Leave features unscaled"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Feature scaling ensures that dimensions with larger ranges do not dominate Euclidean distance."
    },
    {
      "id": 36,
      "questionText": "Scenario: K-Means on a dataset with outliers. Solution?",
      "options": [
        "Use standard K-Means without changes",
        "Randomly assign clusters",
        "Increase K to compensate",
        "Remove or preprocess outliers before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Outliers can distort centroids; preprocessing improves clustering accuracy."
    },
    {
      "id": 37,
      "questionText": "Scenario: K-Means convergence too slow. Solution?",
      "options": [
        "Ignore convergence",
        "Change distance metric arbitrarily",
        "Use Mini-Batch K-Means or reduce dataset size",
        "Increase K randomly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Mini-Batch K-Means or subsampling speeds up convergence for large datasets."
    },
    {
      "id": 38,
      "questionText": "Scenario: K-Means clustering with highly correlated features. Best approach?",
      "options": [
        "Increase K",
        "Ignore correlation",
        "Merge clusters arbitrarily",
        "Apply PCA or feature selection to reduce redundancy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Reducing correlated dimensions prevents redundant information from biasing distance calculations."
    },
    {
      "id": 39,
      "questionText": "Scenario: K-Means clustering on skewed data. Issue?",
      "options": [
        "Noise ignored",
        "Algorithm corrects automatically",
        "Clusters may be biased towards dense regions",
        "Clusters always balanced"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Skewed distributions can lead to unequal cluster sizes or poorly defined boundaries."
    },
    {
      "id": 40,
      "questionText": "Scenario: K-Means with K unknown. Methods to select K?",
      "options": [
        "Random choice",
        "Algorithm decides automatically",
        "Use maximum data points",
        "Elbow method, silhouette score, gap statistic"
      ],
      "correctAnswerIndex": 3,
      "explanation": "These methods help determine optimal K by evaluating clustering performance."
    },
    {
      "id": 41,
      "questionText": "Scenario: K-Means produces very similar clusters on repeated runs. Possible reason?",
      "options": [
        "Algorithm converged incorrectly",
        "Data naturally forms stable clusters",
        "Distance metric is wrong",
        "Initialization randomization failed"
      ],
      "correctAnswerIndex": 1,
      "explanation": "If data has well-separated clusters, K-Means results are stable across runs."
    },
    {
      "id": 42,
      "questionText": "Scenario: K-Means on a small dataset with large K. Risk?",
      "options": [
        "Algorithm fails completely",
        "Centroids ignored",
        "Clusters may be too small or empty",
        "Clusters automatically merge"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Too many clusters for small datasets can produce meaningless or empty clusters."
    },
    {
      "id": 43,
      "questionText": "Scenario: K-Means++ initialization. Benefit?",
      "options": [
        "Improves cluster quality by selecting distant initial centroids",
        "Random initialization",
        "Always produces identical clusters",
        "Removes noise automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means++ reduces the risk of poor initialization by spreading the initial centroids apart."
    },
    {
      "id": 44,
      "questionText": "Scenario: K-Means with categorical features. Solution?",
      "options": [
        "Use K-Prototypes or encode categories numerically",
        "Clusters merge randomly",
        "Ignore categorical data",
        "Use standard K-Means directly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Standard K-Means cannot handle categorical data; K-Prototypes or encoding is needed."
    },
    {
      "id": 45,
      "questionText": "Scenario: K-Means on noisy sensor data. Best practice?",
      "options": [
        "Use raw data",
        "Increase K arbitrarily",
        "Filter or preprocess noise before clustering",
        "Ignore convergence"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Noise affects centroids and cluster assignment; preprocessing improves results."
    },
    {
      "id": 46,
      "questionText": "Scenario: K-Means for image segmentation. Metric for colors?",
      "options": [
        "Cosine similarity",
        "Euclidean distance in RGB or LAB space",
        "Hamming distance",
        "Jaccard index"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Euclidean distance is standard for numerical pixel features in color space."
    },
    {
      "id": 47,
      "questionText": "Scenario: K-Means convergence to local minimum. Reason?",
      "options": [
        "Poor initialization of centroids",
        "Algorithm always finds global minimum",
        "Clusters are too compact",
        "Distance metric is incorrect"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Random initial centroids can lead K-Means to converge to suboptimal local minima."
    },
    {
      "id": 48,
      "questionText": "Scenario: K-Means clustering with overlapping clusters. Limitation?",
      "options": [
        "Clusters merge automatically",
        "Algorithm adapts perfectly",
        "Cannot clearly separate overlapping clusters",
        "Noise ignored"
      ],
      "correctAnswerIndex": 2,
      "explanation": "K-Means relies on distance; overlapping clusters may not be correctly assigned."
    },
    {
      "id": 49,
      "questionText": "Scenario: K-Means for market segmentation. Use case?",
      "options": [
        "Remove outliers automatically",
        "Identify customer groups for targeted campaigns",
        "Predict stock prices",
        "Visualize time series"
      ],
      "correctAnswerIndex": 1,
      "explanation": "K-Means clusters similar customers to enable targeted marketing strategies."
    },
    {
      "id": 50,
      "questionText": "Scenario: K-Means for anomaly detection in credit card transactions. Approach?",
      "options": [
        "Transactions far from cluster centroids may be fraudulent",
        "All transactions treated equally",
        "Clusters merge automatically",
        "Noise ignored"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Outliers distant from normal clusters can indicate anomalous or fraudulent activity."
    },
    {
      "id": 51,
      "questionText": "Scenario: K-Means on high-dimensional gene expression data. Best practice?",
      "options": [
        "Clusters merge randomly",
        "Use raw high-dimensional data directly",
        "Increase K arbitrarily",
        "Use PCA or dimensionality reduction before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Dimensionality reduction yields more meaningful clusters and avoids distance dilution."
    },
    {
      "id": 52,
      "questionText": "Scenario: K-Means on very large dataset. Speed-up technique?",
      "options": [
        "Mini-Batch K-Means",
        "Increase K",
        "Ignore convergence",
        "Use raw data"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Mini-Batch K-Means updates centroids using batches, reducing computation time."
    },
    {
      "id": 53,
      "questionText": "Scenario: K-Means applied to IoT sensor data with missing values. Solution?",
      "options": [
        "Impute missing values before clustering",
        "Remove entire dataset",
        "Assign clusters randomly",
        "Ignore missing values"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means requires complete numerical data; missing values must be handled prior to clustering."
    },
    {
      "id": 54,
      "questionText": "Scenario: K-Means applied to customer purchase history. Challenge?",
      "options": [
        "Clusters automatically balanced",
        "Sparse purchase data may lead to poor cluster separation",
        "Algorithm converges perfectly",
        "Noise ignored"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Sparse or high-dimensional data can reduce clustering accuracy; preprocessing helps."
    },
    {
      "id": 55,
      "questionText": "Scenario: K-Means with categorical features encoded as numbers. Risk?",
      "options": [
        "Algorithm works perfectly",
        "Clusters merge automatically",
        "Noise ignored",
        "Numerical encoding may introduce artificial distance relationships"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Direct numeric encoding of categorical data can misrepresent similarity between categories."
    },
    {
      "id": 56,
      "questionText": "Scenario: K-Means for spatial clustering of stores. Best practice?",
      "options": [
        "Increase K arbitrarily",
        "Clusters merge randomly",
        "Use raw coordinates directly",
        "Normalize coordinates or use appropriate distance metric"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Scaling ensures coordinates are comparable and distance computations are accurate."
    },
    {
      "id": 57,
      "questionText": "Scenario: K-Means produces poor clustering. Possible reason?",
      "options": [
        "Data not suitable for K-Means (non-spherical or overlapping)",
        "Centroids incorrect",
        "Algorithm always finds perfect clusters",
        "Distance metric irrelevant"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means struggles with non-spherical or overlapping clusters."
    },
    {
      "id": 58,
      "questionText": "Scenario: K-Means clustering on scaled features. Advantage?",
      "options": [
        "Distance metric changes",
        "Clusters merge automatically",
        "Prevents dominance by features with large range",
        "Algorithm ignores scaling"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Scaling ensures each feature contributes equally to Euclidean distance calculations."
    },
    {
      "id": 59,
      "questionText": "Scenario: K-Means with clusters of unequal variance. Issue?",
      "options": [
        "Noise ignored",
        "Algorithm automatically adjusts",
        "Clusters always compact",
        "Clusters may not accurately represent data structure"
      ],
      "correctAnswerIndex": 3,
      "explanation": "K-Means assumes similar variance; large differences affect cluster quality."
    },
    {
      "id": 60,
      "questionText": "Scenario: K-Means applied to time-series data. Approach?",
      "options": [
        "Use raw sequences directly",
        "Increase K arbitrarily",
        "Clusters merge randomly",
        "Extract meaningful features before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Feature extraction ensures distance metrics are meaningful for time-series clustering."
    },
    {
      "id": 61,
      "questionText": "Scenario: K-Means clusters overlap. Evaluation metric?",
      "options": [
        "Use SSE only",
        "Clusters merge randomly",
        "Ignore overlap",
        "Silhouette score measures separation and cohesion"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The silhouette score evaluates how well points fit within their own cluster versus neighboring ones."
    },
    {
      "id": 62,
      "questionText": "Scenario: K-Means with too few clusters. Result?",
      "options": [
        "Clusters may merge dissimilar points, reducing interpretability",
        "Algorithm adapts automatically",
        "Clusters always compact",
        "Noise ignored"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Too small a K forces dissimilar points into the same cluster, reducing accuracy."
    },
    {
      "id": 63,
      "questionText": "Scenario: K-Means for market basket analysis. Limitation?",
      "options": [
        "Algorithm works perfectly",
        "Noise ignored",
        "Clusters merge automatically",
        "Sparse and categorical data requires encoding or alternate methods"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Sparse categorical data needs careful preprocessing or K-Prototypes instead of K-Means."
    },
    {
      "id": 64,
      "questionText": "Scenario: K-Means produces empty clusters. Solution?",
      "options": [
        "Ignore empty clusters",
        "Algorithm fails automatically",
        "Increase K randomly",
        "Reinitialize centroids or reduce K"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Reassigning centroids or reducing K resolves empty clusters."
    },
    {
      "id": 65,
      "questionText": "Scenario: K-Means with high-dimensional data. Challenge?",
      "options": [
        "Distance metrics lose meaning; reduce dimensions",
        "Algorithm adapts automatically",
        "Clusters always accurate",
        "Noise ignored"
      ],
      "correctAnswerIndex": 0,
      "explanation": "High dimensions dilute distances, making clustering unreliable without dimensionality reduction."
    },
    {
      "id": 66,
      "questionText": "Scenario: K-Means on normalized vs unnormalized features. Effect?",
      "options": [
        "Algorithm automatically scales",
        "Normalization ensures fair distance contribution across features",
        "Unnormalized always better",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Normalized features prevent features with large ranges from dominating clustering."
    },
    {
      "id": 67,
      "questionText": "Scenario: K-Means++ vs random initialization. Advantage?",
      "options": [
        "Improves clustering stability and convergence",
        "Random initialization always better",
        "No difference in results",
        "Removes noise automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means++ selects initial centroids to reduce poor local minima."
    },
    {
      "id": 68,
      "questionText": "Scenario: K-Means on customer purchase amounts. Data skewed. Solution?",
      "options": [
        "Use raw data",
        "Log-transform or scale data before clustering",
        "Increase K",
        "Ignore skew"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Transforming skewed data prevents high-value points from dominating clustering."
    },
    {
      "id": 69,
      "questionText": "Scenario: K-Means on text data after TF-IDF. Challenge?",
      "options": [
        "Algorithm works perfectly",
        "High-dimensional sparse vectors; dimensionality reduction recommended",
        "Noise ignored",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Sparse, high-dimensional TF-IDF vectors can degrade clustering quality unless the dimensionality is reduced."
    },
    {
      "id": 70,
      "questionText": "Scenario: K-Means applied to IoT device readings. Best practice?",
      "options": [
        "Increase K randomly",
        "Use raw readings",
        "Ignore convergence",
        "Normalize or scale features to ensure meaningful clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Scaling ensures that features contribute equally to distance calculations for clustering."
    },
    {
      "id": 71,
      "questionText": "Scenario: K-Means clustering applied to gene expression data with thousands of features. Best approach?",
      "options": [
        "Apply PCA or feature selection to reduce dimensionality before clustering",
        "Randomly remove features",
        "Increase K arbitrarily",
        "Use all features directly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "High-dimensional gene data can dilute distances; dimensionality reduction ensures meaningful clusters."
    },
    {
      "id": 72,
      "questionText": "Scenario: K-Means with very large K relative to dataset size. Risk?",
      "options": [
        "Clusters may be meaningless or empty",
        "Algorithm automatically adjusts",
        "Distance metric ignored",
        "Clusters merge automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Too many clusters can lead to tiny or empty clusters with no interpretability."
    },
    {
      "id": 73,
      "questionText": "Scenario: K-Means on data with non-uniform density clusters. Limitation?",
      "options": [
        "Noise ignored",
        "Low-density clusters may merge with high-density ones",
        "Algorithm adjusts automatically",
        "Clusters always compact"
      ],
      "correctAnswerIndex": 1,
      "explanation": "K-Means does not handle varying densities well; denser clusters dominate centroid assignment."
    },
    {
      "id": 74,
      "questionText": "Scenario: K-Means on highly skewed financial transaction data. Best preprocessing?",
      "options": [
        "Apply log transformation to reduce skew before clustering",
        "Use raw data",
        "Clusters merge randomly",
        "Increase K arbitrarily"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Log or other transformations reduce the effect of extreme values, improving clustering quality."
    },
    {
      "id": 75,
      "questionText": "Scenario: K-Means on time-series data. Effective method?",
      "options": [
        "Extract meaningful features such as trends or seasonal components before clustering",
        "Use raw sequences directly",
        "Clusters merge automatically",
        "Increase K randomly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Feature extraction ensures distances reflect meaningful similarities in time-series."
    },
    {
      "id": 76,
      "questionText": "Scenario: K-Means clustering for anomaly detection in network traffic. Strategy?",
      "options": [
        "All points treated equally",
        "Points far from cluster centroids are likely anomalies",
        "Noise ignored",
        "Clusters merge automatically"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Outliers distant from normal traffic clusters are potential anomalies."
    },
    {
      "id": 77,
      "questionText": "Scenario: K-Means applied to image color compression. Challenge?",
      "options": [
        "Algorithm automatically selects K",
        "All clusters identical",
        "Noise ignored",
        "Choosing optimal K to balance compression and image quality"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Selecting K is critical; too few clusters lose color detail, while too many reduce the compression benefit."
    },
    {
      "id": 78,
      "questionText": "Scenario: K-Means++ vs multiple random initializations. Advantage of K-Means++?",
      "options": [
        "Reduces likelihood of poor local minima and improves convergence",
        "Removes noise automatically",
        "Random initializations are better",
        "No difference in results"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means++ selects initial centroids that are distant, improving stability and cluster quality."
    },
    {
      "id": 79,
      "questionText": "Scenario: K-Means applied to sparse TF-IDF text vectors. Best approach?",
      "options": [
        "Use raw sparse vectors directly",
        "Increase K arbitrarily",
        "Reduce dimensionality using techniques like Truncated SVD before clustering",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "High-dimensional sparse data may produce poor clusters; dimensionality reduction improves performance."
    },
    {
      "id": 80,
      "questionText": "Scenario: K-Means clustering with overlapping spherical clusters. How to improve?",
      "options": [
        "K-Means always works",
        "Clusters merge automatically",
        "Reduce K randomly",
        "Use Gaussian Mixture Models (GMM) for soft clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "GMM can model cluster overlap using probability distributions, unlike hard K-Means assignments."
    },
    {
      "id": 81,
      "questionText": "Scenario: K-Means for customer segmentation with categorical attributes. Best practice?",
      "options": [
        "Use K-Prototypes or encode categories numerically",
        "Ignore categorical data",
        "Use standard K-Means directly",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Prototypes handles mixed numerical and categorical data effectively."
    },
    {
      "id": 82,
      "questionText": "Scenario: K-Means convergence to local minimum. Cause?",
      "options": [
        "Distance metric incorrect",
        "Clusters too compact",
        "Poor or random initialization of centroids",
        "Algorithm always finds global minimum"
      ],
      "correctAnswerIndex": 2,
      "explanation": "K-Means may converge to suboptimal solutions depending on initial centroids."
    },
    {
      "id": 83,
      "questionText": "Scenario: K-Means applied to geospatial clustering. Recommendation?",
      "options": [
        "Increase K randomly",
        "Use appropriate distance metrics like haversine for coordinates",
        "Clusters merge arbitrarily",
        "Use Euclidean distance blindly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Geographic distances require the correct metric to ensure accurate clustering."
    },
    {
      "id": 84,
      "questionText": "Scenario: K-Means with very large datasets. Efficient solution?",
      "options": [
        "Use Mini-Batch K-Means",
        "Increase K arbitrarily",
        "Ignore convergence",
        "Use full dataset only"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Mini-Batch K-Means speeds up computation by using small random batches for centroid updates."
    },
    {
      "id": 85,
      "questionText": "Scenario: K-Means on noisy IoT sensor data. Best preprocessing?",
      "options": [
        "Filter or smooth noise before clustering",
        "Clusters merge automatically",
        "Increase K arbitrarily",
        "Use raw data"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Noise can distort centroids; preprocessing improves clustering reliability."
    },
    {
      "id": 86,
      "questionText": "Scenario: K-Means on very high-dimensional data. Limitation?",
      "options": [
        "Distance metrics lose meaning; dimensionality reduction recommended",
        "Noise ignored",
        "Clusters always accurate",
        "Algorithm adapts automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "High-dimensional spaces dilute distances, leading to poor cluster assignments."
    },
    {
      "id": 87,
      "questionText": "Scenario: K-Means for anomaly detection in healthcare data. Approach?",
      "options": [
        "Noise ignored",
        "Points far from cluster centroids may indicate anomalies",
        "Clusters merge automatically",
        "All points treated equally"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Outliers distant from normal clusters can indicate anomalies or rare events."
    },
    {
      "id": 88,
      "questionText": "Scenario: K-Means on image segmentation with varying illumination. Challenge?",
      "options": [
        "Preprocessing like normalization is needed to reduce lighting effects",
        "Algorithm works perfectly",
        "Increase K randomly",
        "Clusters merge automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Differences in lighting affect pixel values; normalization improves clustering consistency."
    },
    {
      "id": 89,
      "questionText": "Scenario: K-Means for market segmentation with mixed purchase behavior. Solution?",
      "options": [
        "Ignore categorical data",
        "Clusters merge randomly",
        "Use numerical encoding or K-Prototypes for categorical and numerical features",
        "Use standard K-Means directly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Mixed data requires specialized clustering methods for meaningful segmentation."
    },
    {
      "id": 90,
      "questionText": "Scenario: K-Means clustering produces empty clusters repeatedly. Best solution?",
      "options": [
        "Algorithm fails automatically",
        "Increase K arbitrarily",
        "Ignore empty clusters",
        "Reinitialize centroids or reduce K"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Empty clusters occur when centroids have no assigned points; reinitialization or lowering K resolves this."
    },
    {
      "id": 91,
      "questionText": "Scenario: K-Means applied to highly imbalanced datasets. Issue?",
      "options": [
        "Large clusters may dominate, small clusters underrepresented",
        "Clusters always balanced",
        "Noise ignored",
        "Algorithm adapts automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means does not account for cluster size; imbalance may distort results."
    },
    {
      "id": 92,
      "questionText": "Scenario: K-Means applied to network traffic logs for intrusion detection. Best approach?",
      "options": [
        "Use raw logs directly",
        "Increase K arbitrarily",
        "Preprocess logs into numerical features and detect points far from centroids",
        "Clusters merge automatically"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Transforming logs to numerical vectors enables clustering and anomaly detection."
    },
    {
      "id": 93,
      "questionText": "Scenario: K-Means clustering with multiple valid K values. Evaluation metric?",
      "options": [
        "Silhouette score to evaluate cluster quality",
        "Ignore K selection",
        "Use SSE only",
        "Clusters merge randomly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Silhouette score measures cohesion and separation, helping choose optimal K."
    },
    {
      "id": 94,
      "questionText": "Scenario: K-Means applied to text clustering using word embeddings. Limitation?",
      "options": [
        "High-dimensional vectors may require dimensionality reduction or normalization",
        "Clusters merge randomly",
        "Algorithm works perfectly",
        "Noise ignored"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Dimensionality reduction and normalization improve clustering accuracy for embeddings."
    },
    {
      "id": 95,
      "questionText": "Scenario: K-Means clustering results vary on repeated runs. Best solution?",
      "options": [
        "Use K-Means++ initialization or multiple runs",
        "Clusters merge randomly",
        "Ignore variations",
        "Increase K arbitrarily"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Better initialization reduces sensitivity to random centroid placement."
    },
    {
      "id": 96,
      "questionText": "Scenario: K-Means on scaled features vs unscaled features. Observation?",
      "options": [
        "Scaling ensures fair contribution of all features to distance calculation",
        "Clusters merge randomly",
        "Algorithm adapts automatically",
        "Scaling is unnecessary"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Without scaling, features with larger ranges dominate cluster assignments."
    },
    {
      "id": 97,
      "questionText": "Scenario: K-Means clustering on overlapping clusters. Alternative?",
      "options": [
        "Reduce K randomly",
        "Clusters merge automatically",
        "Use soft clustering like Gaussian Mixture Models",
        "K-Means handles overlap perfectly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Soft clustering models allow points to belong probabilistically to multiple clusters."
    },
    {
      "id": 98,
      "questionText": "Scenario: K-Means applied to sensor network data with missing values. Solution?",
      "options": [
        "Impute missing values before clustering",
        "Assign clusters randomly",
        "Remove entire dataset",
        "Ignore missing values"
      ],
      "correctAnswerIndex": 0,
      "explanation": "K-Means requires complete data; missing values must be handled prior to clustering."
    },
    {
      "id": 99,
      "questionText": "Scenario: K-Means on customer behavior data with high variance features. Best approach?",
      "options": [
        "Increase K arbitrarily",
        "Clusters merge randomly",
        "Use raw data",
        "Scale or normalize features to prevent dominance by high-variance features"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Scaling ensures fair contribution of each feature to distance computation."
    },
    {
      "id": 100,
      "questionText": "Scenario: K-Means applied to a large dataset with many outliers. Recommendation?",
      "options": [
        "Increase K arbitrarily",
        "Clusters merge automatically",
        "Use raw data directly",
        "Preprocess to remove or handle outliers before clustering"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Outliers distort centroids; preprocessing ensures meaningful cluster assignments."
    }
  ]
}