Principal Component Analysis (PCA) is an unsupervised learning algorithm.
Implementation from scratch
import numpy as np
from numpy import ndarray
class PCA:
    """Principal Component Analysis via eigendecomposition of the covariance matrix.

    ``fit`` centres the data, eigendecomposes its covariance matrix and keeps
    the ``n_components`` eigenvectors with the largest eigenvalues.
    ``transform`` centres new data with the fitted mean and projects it onto
    those eigenvectors.

    Attributes:
        n_components: Number of principal components to keep.
        mean: Per-feature mean of the fitted data; ``None`` until ``fit`` runs.
        components: Array of shape ``(n_features, n_components)`` whose columns
            are the selected eigenvectors, ordered by decreasing eigenvalue;
            ``None`` until ``fit`` runs.
    """

    def __init__(self, n_components: int = 3):
        """Create an unfitted PCA model.

        Args:
            n_components: Number of principal components to keep (>= 1).

        Raises:
            ValueError: If ``n_components`` is not at least 1.
        """
        # Written as ``not >=`` so that NaN is rejected as well.
        if not n_components >= 1:
            raise ValueError("n_components must be a positive integer")
        self.n_components = n_components
        self.mean = None
        self.components = None

    def _check_X(self, X: ndarray) -> None:
        """Validate that ``X`` is a non-empty 2-D array.

        Raises:
            ValueError: If ``X`` is not 2-D or has no rows.
        """
        if len(X.shape) != 2:
            raise ValueError("X must be of shape (n_samples, n_features)")
        if X.shape[0] < 1:
            raise ValueError("X must contain at least 1 sample")

    def fit(self, X: ndarray) -> "PCA":
        """Estimate the mean and the principal components of ``X``.

        Args:
            X: Data of shape ``(n_samples, n_features)``.

        Returns:
            This instance, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not a valid 2-D array, has fewer features
                than ``n_components``, or has fewer than 2 samples.
        """
        self._check_X(X)
        if X.shape[1] < self.n_components:
            raise ValueError("The number of features in X cannot be smaller than n_components")
        # The sample covariance divides by (n_samples - 1); a single sample
        # would yield an all-NaN matrix.
        if X.shape[0] < 2:
            raise ValueError("fit() requires at least 2 samples to estimate the covariance")

        self.mean = X.mean(axis=0)
        X_centered = X - self.mean

        # np.cov returns a 0-d array when there is a single feature; the
        # eigendecomposition needs a square 2-D matrix.
        cov_matrix = np.atleast_2d(np.cov(X_centered, rowvar=False))

        # The covariance matrix is symmetric, so eigh is the appropriate
        # routine: it returns real eigenvalues and orthonormal eigenvectors.
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Eigenvectors are the *columns* of the returned matrix; keep those
        # belonging to the largest eigenvalues.
        sorted_indices = np.argsort(eigenvalues)[::-1]
        self.components = eigenvectors[:, sorted_indices[: self.n_components]]
        return self

    def transform(self, X: ndarray) -> ndarray:
        """Project ``X`` onto the fitted principal components.

        Args:
            X: Data of shape ``(n_samples, n_features)`` with the same number
                of features as the data passed to ``fit``.

        Returns:
            Array of shape ``(n_samples, n_components)``.

        Raises:
            ValueError: If ``X`` is not a valid 2-D array or its feature count
                differs from the fitted data.
            RuntimeError: If ``fit`` has not been called.
        """
        self._check_X(X)
        if self.mean is None or self.components is None:
            raise RuntimeError("Must call fit() first")
        if X.shape[1] != self.components.shape[0]:
            raise ValueError(
                f"X has {X.shape[1]} features, but PCA was fitted with "
                f"{self.components.shape[0]} features"
            )
        return np.dot(X - self.mean, self.components)
Unit test:
def test_pca():
    """Placeholder for the PCA unit test; no checks are implemented yet."""
    ...


# Run the unit test as soon as this module is executed.
test_pca()