From 26c8803281bd575edb669cd285452075cc073fb8 Mon Sep 17 00:00:00 2001 From: Vishal Patil Date: Mon, 1 Apr 2024 20:19:08 -0400 Subject: [PATCH] Inital commit --- README.md | 5 + anamoly-detection/Cargo.toml | 13 ++ anamoly-detection/README.md | 3 + anamoly-detection/src/main.rs | 100 +++++++++++++ k-means/Cargo.toml | 13 ++ k-means/README.md | 3 + k-means/src/main.rs | 106 ++++++++++++++ linear-regression/Cargo.toml | 14 ++ linear-regression/README.md | 7 + linear-regression/src/main.rs | 245 ++++++++++++++++++++++++++++++++ logistic-regression/Cargo.toml | 15 ++ logistic-regression/README.md | 7 + logistic-regression/src/main.rs | 177 +++++++++++++++++++++++ neural-networks/Cargo.toml | 14 ++ neural-networks/README.md | 3 + neural-networks/src/main.rs | 135 ++++++++++++++++++ pca/Cargo.toml | 13 ++ pca/README.md | 3 + pca/src/main.rs | 100 +++++++++++++ recommender-system/Cargo.toml | 13 ++ recommender-system/README.md | 3 + recommender-system/src/main.rs | 234 ++++++++++++++++++++++++++++++ 22 files changed, 1226 insertions(+) create mode 100644 README.md create mode 100644 anamoly-detection/Cargo.toml create mode 100644 anamoly-detection/README.md create mode 100644 anamoly-detection/src/main.rs create mode 100644 k-means/Cargo.toml create mode 100644 k-means/README.md create mode 100644 k-means/src/main.rs create mode 100644 linear-regression/Cargo.toml create mode 100644 linear-regression/README.md create mode 100644 linear-regression/src/main.rs create mode 100644 logistic-regression/Cargo.toml create mode 100644 logistic-regression/README.md create mode 100644 logistic-regression/src/main.rs create mode 100644 neural-networks/Cargo.toml create mode 100644 neural-networks/README.md create mode 100644 neural-networks/src/main.rs create mode 100644 pca/Cargo.toml create mode 100644 pca/README.md create mode 100644 pca/src/main.rs create mode 100644 recommender-system/Cargo.toml create mode 100644 recommender-system/README.md create mode 100644 recommender-system/src/main.rs diff --git a/README.md b/README.md new file mode 100644 index 0000000..7f7b6bb --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +## Machine Learning with Rust using Candle + +This repository features implementations of algorithms from the Stanford University [Machine Learning Course](https://www.youtube.com/@machinelearningandai3274), all crafted in Rust using the [Candle](https://github.com/huggingface/candle) crate. Each example leverages diverse datasets from Kaggle to demonstrate the algorithms' applications. + +All of the examples are CUDA enabled but can run on a machine without a GPU as well, though might be quite slow. 
\ No newline at end of file diff --git a/anamoly-detection/Cargo.toml b/anamoly-detection/Cargo.toml new file mode 100644 index 0000000..530a098 --- /dev/null +++ b/anamoly-detection/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "anamoly-detection" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +csv = "1.1.6" +anyhow = "1.0.40" +clap = {version = "4.3.1", features = ["derive"]} +rand = "0.8.5" +candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] } \ No newline at end of file diff --git a/anamoly-detection/README.md b/anamoly-detection/README.md new file mode 100644 index 0000000..9c5a33b --- /dev/null +++ b/anamoly-detection/README.md @@ -0,0 +1,3 @@ +# Anamoly Detection + +[Anamoly Detection](https://youtu.be/UqqPm-Q4aMo?si=TCZFJOJv94R1i71u) using Gaussian Distribution for the Kaggle [EECS 498 dataset](https://www.kaggle.com/c/eecs498/data). \ No newline at end of file diff --git a/anamoly-detection/src/main.rs b/anamoly-detection/src/main.rs new file mode 100644 index 0000000..75f1d15 --- /dev/null +++ b/anamoly-detection/src/main.rs @@ -0,0 +1,100 @@ +extern crate csv; +use std::vec; + +use anyhow::Result; +use candle_core::{Device, Tensor}; +use clap::Parser; + +fn load_dataset(file_path: &str, device: &Device) -> Result { + let mut rdr = csv::Reader::from_path(file_path)?; + let mut data = Vec::new(); + for result in rdr.records() { + let record = result?; + let mut row = vec![]; + for i in 1..4 { + row.push(record[i].parse::()?); + } + data.push(row); + } + let feature_cnt = data[0].len(); + let sample_cnt = data.len(); + let data = data.into_iter().flatten().collect::>(); + let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?; + Ok(data) +} + +fn z_score_normalize(data: &Tensor) -> Result { + let mean = data.mean(0)?; + let squared_diff = data.broadcast_sub(&mean)?.sqr()?; + let variance = squared_diff.mean(0)?; + let std_dev = variance.sqrt()?; + let normalized = data.broadcast_sub(&mean)?.broadcast_div(&std_dev)?; + Ok(normalized) +} + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + // Data CSV file from https://www.kaggle.com/c/eecs498/data + #[arg(long)] + data_csv: String, + + #[arg(long, short, default_value = "false")] + print: bool, + + #[arg(long, default_value = "0.001")] + episilon: f64, +} + +fn p_x( + x: &Tensor, + mean: &Tensor, + two_variance: &Tensor, + two_pi_sqrt_std_dev: &Tensor, +) -> Result { + let px = x + .broadcast_sub(mean)? + .sqr()? + .broadcast_div(two_variance)? + .exp()? + .broadcast_mul(two_pi_sqrt_std_dev)? + .recip()?; + let px = px.to_vec1::()?.into_iter().fold(1.0, |acc, x| acc * x); + Ok(px) +} + +fn main() -> Result<()> { + let args = Args::parse(); + + let device = Device::cuda_if_available(0)?; + let data = load_dataset(&args.data_csv, &device)?; + + let data = z_score_normalize(&data)?; + + let mean = data.mean(0)?; + let variance = data.broadcast_sub(&mean)?.sqr()?.mean(0)?; + let std_dev = variance.sqrt()?; + + let two_variance = variance.broadcast_mul(&Tensor::new(2.0, &device)?)?; + let two_pi_sqrt_std_dev = + std_dev.broadcast_mul(&Tensor::new(2.0 * std::f64::consts::PI, &device)?.sqrt()?)?; + + let rows = data.shape().dims2()?.0; + let mut anamolies = 0; + for row in 0..rows { + let row_tensor = data + .index_select(&Tensor::new(&[row as u32], &device)?, 0)? 
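+ // index_select with a single row index yields a (1, n_features) tensor; squeeze(0) flattens it to a 1-D feature vector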
+ .squeeze(0)?; + let px = p_x(&row_tensor, &mean, &two_variance, &two_pi_sqrt_std_dev)?; + if px < args.episilon { + anamolies += 1; + if args.print { + println!("Anamoly: {}", row + 1); + } + } + } + + println!("Anamolies: {}, Total: {}", anamolies, rows); + + Ok(()) +} diff --git a/k-means/Cargo.toml b/k-means/Cargo.toml new file mode 100644 index 0000000..480078f --- /dev/null +++ b/k-means/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "k-means" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +csv = "1.1.6" +anyhow = "1.0.40" +clap = {version = "4.3.1", features = ["derive"]} +rand = "0.8.5" +candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] } \ No newline at end of file diff --git a/k-means/README.md b/k-means/README.md new file mode 100644 index 0000000..736877d --- /dev/null +++ b/k-means/README.md @@ -0,0 +1,3 @@ +# K-Means clustering + +[K-Means clustering](https://youtu.be/0D4LnsJr85Y?si=qmkMIqY39rMJUBDk) for the Kaggle [Irisi dataset](https://www.kaggle.com/datasets/uciml/iris/data) \ No newline at end of file diff --git a/k-means/src/main.rs b/k-means/src/main.rs new file mode 100644 index 0000000..680bea8 --- /dev/null +++ b/k-means/src/main.rs @@ -0,0 +1,106 @@ +extern crate csv; +use std::vec; + +use anyhow::Result; +use candle_core::{DType, Device, Tensor, D}; +use clap::Parser; +use rand::prelude::*; + +fn cdist(x1: &Tensor, x2: &Tensor) -> Result { + let x1 = x1.unsqueeze(0)?; + let x2 = x2.unsqueeze(1)?; + Ok(x1 + .broadcast_sub(&x2)? + .sqr()? + .sum(D::Minus1)? + .sqrt()? + .transpose(D::Minus1, D::Minus2)?) +} + +fn load_dataset(file_path: &str, device: &Device) -> Result { + let mut rdr = csv::Reader::from_path(file_path)?; + let mut data = Vec::new(); + for result in rdr.records() { + let record = result?; + let mut row = vec![]; + for i in 1..5 { + row.push(record[i].parse::()?); + } + data.push(row); + } + let feature_cnt = data[0].len(); + let sample_cnt = data.len(); + let data = data.into_iter().flatten().collect::>(); + let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?; + Ok(data) +} + +fn k_means(data: &Tensor, k: usize, max_iter: i64, device: &Device) -> Result<(Tensor, Tensor)> { + let (n, _) = data.dims2()?; + let mut rng = rand::thread_rng(); + let mut indices = (0..n).collect::>(); + indices.shuffle(&mut rng); + + let centroid_idx = indices[..k] + .iter() + .copied() + .map(|x| x as i64) + .collect::>(); + + let centroid_idx_tensor = Tensor::from_slice(centroid_idx.as_slice(), (k,), device)?; + let mut centers = data.index_select(¢roid_idx_tensor, 0)?; + let mut cluster_assignments = Tensor::zeros((n,), DType::U32, device)?; + for _ in 0..max_iter { + let dist = cdist(data, ¢ers)?; + cluster_assignments = dist.argmin(D::Minus1)?; + let mut centers_vec = vec![]; + for i in 0..k { + let mut indices = vec![]; + cluster_assignments + .to_vec1::()? 
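+ // walk the current assignments and collect the row indices of every sample that belongs to cluster i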
+ .iter() + .enumerate() + .for_each(|(j, x)| { + if *x == i as u32 { + indices.push(j as u32); + } + }); + let indices = Tensor::from_slice(indices.as_slice(), (indices.len(),), device)?; + let cluster_data = data.index_select(&indices, 0)?; + let mean = cluster_data.mean(0)?; + centers_vec.push(mean); + } + centers = Tensor::stack(centers_vec.as_slice(), 0)?; + } + Ok((centers, cluster_assignments)) +} + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + // Data CSV file from https://www.kaggle.com/datasets/uciml/iris/data + #[arg(long)] + data_csv: String, + + // Number of clusters + #[arg(long, default_value = "3")] + k: usize, + + // Maximum number of iterations + #[arg(long, default_value = "100")] + max_iter: i64, +} +fn main() -> Result<()> { + let args = Args::parse(); + let device = Device::cuda_if_available(0)?; + let data = load_dataset(&args.data_csv, &device).unwrap(); + let (centers, cluster_assignments) = k_means(&data, args.k, args.max_iter, &device)?; + println!("{}", centers); + println!("{}", cluster_assignments); + let cluster_sizes = cluster_assignments.to_vec1::()?; + for i in 0..args.k { + let size = cluster_sizes.iter().filter(|&&x| x == i as u32).count(); + println!("Cluster {} size: {}", i, size); + } + Ok(()) +} diff --git a/linear-regression/Cargo.toml b/linear-regression/Cargo.toml new file mode 100644 index 0000000..86f312b --- /dev/null +++ b/linear-regression/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "linear-regression" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +csv = "1.1.6" +ndarray = "0.15.3" +candle = { git = "https://github.com/huggingface/candle", package = "candle-core", features = ["cuda"]} +anyhow = "1.0.40" +clap = { version = "4.5.1", features = ["derive"] } +rand = "0.8.5" diff --git a/linear-regression/README.md b/linear-regression/README.md new file mode 100644 index 0000000..1c72ac1 --- /dev/null +++ b/linear-regression/README.md @@ -0,0 +1,7 @@ +# Linear regression + +[Linear regression](https://youtu.be/W46UTQ_JDPk?si=dfz9_kFBUkM3E1RR) (with regularization) model using gradient descent implemented for the Kaggle [insurance dataset](https://www.kaggle.com/code/kianwee/linear-regression-insurance-dataset). + +```bash +cargo run -- --data-csv ./insurance.csv +``` diff --git a/linear-regression/src/main.rs b/linear-regression/src/main.rs new file mode 100644 index 0000000..e2b5c18 --- /dev/null +++ b/linear-regression/src/main.rs @@ -0,0 +1,245 @@ +extern crate csv; +use anyhow::Result; +use candle::{Device, Tensor, D}; +use clap::Parser; +use core::panic; +use rand::prelude::*; +use std::fs::File; +use std::rc::Rc; + +struct Dataset { + pub training_data: Tensor, + pub training_labels: Tensor, + pub test_data: Tensor, + pub test_labels: Tensor, + pub feature_cnt: usize, +} + +// Implement Linear Regression model using Gradient Descent +// https://www.youtube.com/watch?v=UVCFaaEBnTE +struct LinearRegression { + weights: Tensor, + bias: Tensor, + device: Rc, +} + +impl LinearRegression { + fn new(feature_cnt: usize, device: Rc) -> Result { + let weights: Vec = vec![0.0; feature_cnt]; + let weights = Tensor::from_vec(weights, (feature_cnt,), &device)?; + let bias = Tensor::new(0.0f32, &device)?; + Ok(Self { + weights, + bias, + device, + }) + } + + fn hypothesis(&self, x: &Tensor) -> Result { + Ok(x.matmul(&self.weights.unsqueeze(1)?)? + .squeeze(1)? + .broadcast_add(&self.bias)?) 
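+ // prediction for the whole batch: y_hat = X * w + b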
+ } + + fn loss(&self, y1: &Tensor, y2: &Tensor) -> Result { + let diff = y1.sub(y2)?; + let loss = diff.mul(&diff)?.mean(D::Minus1)?.to_scalar::()?; + Ok(loss) + } + + fn train( + &mut self, + x: &Tensor, + y: &Tensor, + learning_rate: f32, + regularization: f32, + ) -> Result<()> { + let m = y.shape().dims1()?; + let predictions = self.hypothesis(x)?; + let deltas = predictions.sub(y)?; + let regularization = self + .weights + .broadcast_mul(&Tensor::new(regularization / m as f32, &self.device)?)?; + + let gradient = x + .t()? + .matmul(&deltas.unsqueeze(D::Minus1)?)? + .broadcast_div(&Tensor::new(m as f32, &self.device)?)?; + let gradient = gradient + .squeeze(D::Minus1)? + .squeeze(D::Minus1)? + .add(®ularization)?; + self.weights = self + .weights + .sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?; + let gradient = deltas.mean(D::Minus1)?; + self.bias = self + .bias + .sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?; + Ok(()) + } +} + +fn r2_score(predictions: &Tensor, labels: &Tensor) -> Result> { + let mean = labels.mean(D::Minus1)?; + + let ssr = labels.sub(predictions)?; + let ssr = ssr.mul(&ssr)?.sum(D::Minus1)?; + + let sst = labels.broadcast_sub(&mean)?; + let sst = sst.mul(&sst)?.sum(D::Minus1)?; + + let tmp = ssr.div(&sst)?.to_scalar::()?; + + Ok(1.0 - tmp) +} + +const BATCH_SIZE: usize = 100; + +fn insurance_dataset(file_path: &str, device: &Device) -> Result { + // https://www.kaggle.com/mirichoi0218/insurance + + let file = File::open(file_path)?; + let mut rdr = csv::Reader::from_reader(file); + let mut data: Vec> = vec![]; + let mut labels: Vec = vec![]; + + const FEATURE_CNT: usize = 6; + const MALE: f32 = 0.5; + const FEMALE: f32 = -0.5; + + const YES: f32 = 0.5; + const NO: f32 = -0.5; + + const NORTHWEST: f32 = 0.25; + const NORTHEAST: f32 = -0.25; + const SOUTHWEST: f32 = 0.5; + const SOUTHEAST: f32 = -0.5; + + for result in rdr.records() { + let record = result?; + let age: f32 = (record[0].parse::()? as f32) / 100.0; + let gender = match record[1].parse::()?.as_str() { + "male" => MALE, + "female" => FEMALE, + _ => panic!("Invalid Gender"), + }; + let bmi: f32 = record[2].parse::()? 
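+ // like age above, bmi is divided by 100 to keep the raw features on a comparable scale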
/ 100.0; + let children: f32 = record[3].parse()?; + let smoker = match record[4].parse::()?.as_str() { + "yes" => YES, + "no" => NO, + _ => panic!("Invalid Smoker"), + }; + let region = match record[5].parse::()?.as_str() { + "northwest" => NORTHWEST, + "northeast" => NORTHEAST, + "southwest" => SOUTHWEST, + "southeast" => SOUTHEAST, + _ => panic!("Invalid Region"), + }; + let charges: f32 = record[6].parse()?; + + let row = vec![age, gender, bmi, children, smoker, region]; + data.push(row); + + let label = charges; + labels.push(label); + } + let training_size = labels.len() * 8 / 10; + let training_data = data[..training_size].to_vec(); + let training_labels = labels[..training_size].to_vec(); + + let training_data = training_data + .iter() + .flatten() + .copied() + .collect::>(); + let training_data_tensor = + Tensor::from_slice(&training_data, (training_labels.len(), FEATURE_CNT), device)?; + let training_labels_tensor = + Tensor::from_slice(&training_labels, (training_labels.len(),), device)?; + + let test_data = data[training_size..].to_vec(); + let test_labels = labels[training_size..].to_vec(); + + let test_data = test_data.iter().flatten().copied().collect::>(); + let test_data_tensor = + Tensor::from_slice(&test_data, (test_labels.len(), FEATURE_CNT), device)?; + let test_labels_tensor = Tensor::from_slice(&test_labels, (test_labels.len(),), device)?; + + Ok(Dataset { + training_data: training_data_tensor, + training_labels: training_labels_tensor, + test_data: test_data_tensor, + test_labels: test_labels_tensor, + feature_cnt: FEATURE_CNT, + }) +} + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + #[arg(long)] + data_csv: String, + + // Print the Cost and Loss at each epoch + #[arg(long, default_value_t = false)] + progress: bool, + + // The learning rate + #[arg(long, default_value = "0.01")] + learning_rate: f32, + + // The regularization parameter + #[arg(long, default_value = "0.01")] + regularization: f32, + + // The number of epochs + #[arg(long, default_value = "10000")] + epochs: i32, +} +fn main() -> Result<()> { + let args = Args::parse(); + let file_path = args.data_csv; + + let device = Rc::new(Device::cuda_if_available(0)?); + let dataset = insurance_dataset(&file_path, &device)?; + + let mut model = LinearRegression::new(dataset.feature_cnt, device)?; + let (training_size, _) = dataset.training_data.shape().dims2()?; + let n_batches = training_size / BATCH_SIZE; + let mut batch_idxs = (0..n_batches).collect::>(); + + for epoch in 0..args.epochs { + let mut sum_loss = 0.0; + batch_idxs.shuffle(&mut rand::thread_rng()); + for batch_idx in batch_idxs.iter() { + let train_data = dataset + .training_data + .narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?; + let train_labels = + dataset + .training_labels + .narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?; + model.train( + &train_data, + &train_labels, + args.learning_rate, + args.regularization, + )?; + let predictions = model.hypothesis(&train_data)?; + let loss = model.loss(&predictions, &train_labels)?; + sum_loss += loss; + } + if args.progress && epoch % 1000 == 0 { + let predictions = model.hypothesis(&dataset.test_data)?; + let r2 = r2_score(&predictions, &dataset.test_labels).unwrap(); + println!("epoch: {epoch}, loss: {}, accuracy: {}", sum_loss / n_batches as f32, r2); + } + } + + + + Ok(()) +} diff --git a/logistic-regression/Cargo.toml b/logistic-regression/Cargo.toml new file mode 100644 index 0000000..63a19db --- /dev/null +++ 
b/logistic-regression/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "logistic-regression" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +csv = "1.1.6" +ndarray = "0.15.3" +anyhow = "1.0.40" +clap = {version = "4.3.1", features = ["derive"]} +rand = "0.8.5" +candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] } +candle-datasets = { git = "https://github.com/huggingface/candle.git", version = "0.4.1" } diff --git a/logistic-regression/README.md b/logistic-regression/README.md new file mode 100644 index 0000000..ba86884 --- /dev/null +++ b/logistic-regression/README.md @@ -0,0 +1,7 @@ +# Logistic Regression + +[Logistic regression](https://youtu.be/4u81xU7BIOc?si=7ZSLIqS-bzrBZWgL) (with regularization) model using gradient descent implemented for the MNIST [dataset](https://www.kaggle.com/datasets/hojjatk/mnist-dataset) to distinguish between zero vs non-zero digits. + +```bash +cargo run +``` \ No newline at end of file diff --git a/logistic-regression/src/main.rs b/logistic-regression/src/main.rs new file mode 100644 index 0000000..6a03a7b --- /dev/null +++ b/logistic-regression/src/main.rs @@ -0,0 +1,177 @@ +extern crate csv; +use anyhow::Result; +use candle_core::{Device, Tensor, D}; +use clap::Parser; +use rand::prelude::*; +use std::rc::Rc; + +// Implement Logistic Regression model using Gradient Descent +// https://www.youtube.com/watch?v=4u81xU7BIOc +struct LogisticRegression { + weights: Tensor, + bias: Tensor, + device: Rc, +} + +fn sigmoid(xs: &Tensor) -> Result { + Ok((xs.neg()?.exp()? + 1.0)?.recip()?) +} + +impl LogisticRegression { + fn new(feature_cnt: usize, device: Rc) -> Result { + let weights: Vec = vec![0.0; feature_cnt]; + let weights = Tensor::from_vec(weights, (feature_cnt,), &device)?; + let bias = Tensor::new(0.0f32, &device)?; + Ok(Self { + weights, + bias, + device, + }) + } + + fn hypothesis(&self, x: &Tensor) -> Result { + Ok(sigmoid( + &x.matmul(&self.weights.unsqueeze(1)?)? + .squeeze(1)? + .broadcast_add(&self.bias)?, + )?) + } + + fn loss(&self, y1: &Tensor, y2: &Tensor) -> Result { + let diff = y1.sub(y2)?; + let loss = diff.mul(&diff)?.mean(D::Minus1)?.to_scalar::()?; + Ok(loss) + } + + fn train( + &mut self, + x: &Tensor, + y: &Tensor, + learning_rate: f32, + regularization: f32, + ) -> Result<()> { + let m = y.shape().dims1()?; + let predictions = self.hypothesis(x)?; + let deltas = predictions.sub(y)?; + let regularization = self + .weights + .broadcast_mul(&Tensor::new(regularization / m as f32, &self.device)?)?; + + let gradient = x + .t()? + .matmul(&deltas.unsqueeze(D::Minus1)?)? + .broadcast_div(&Tensor::new(m as f32, &self.device)?)?; + let gradient = gradient + .squeeze(D::Minus1)? + .squeeze(D::Minus1)? 
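+ // fold the L2 regularization term (lambda / m * w) into the weight gradient before the update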
+ .add(®ularization)?; + + self.weights = self + .weights + .sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?; + let gradient = deltas.mean(D::Minus1)?; + self.bias = self + .bias + .sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?; + Ok(()) + } +} + +const BATCH_SIZE: usize = 100; + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + // Print the Cost and Loss at each epoch + #[arg(long, default_value_t = false)] + progress: bool, + // The learning rate + #[arg(long, default_value = "0.01")] + learning_rate: f32, + + // Regularization parameter + #[arg(long, default_value = "0.1")] + regularization: f32, + + // The number of epochs + #[arg(long, default_value = "10000")] + epochs: i32, + + // The digit to classify + #[arg(long, default_value = "0")] + digit: u8, +} + +fn main() -> Result<()> { + let args = Args::parse(); + let device = Rc::new(Device::cuda_if_available(0)?); + + let dataset = candle_datasets::vision::mnist::load()?; + let (_, n) = dataset.train_images.shape().dims2()?; + let training_images = dataset.train_images.to_device(&device)?; + let training_labels = dataset.train_labels.to_device(&device)?; + let training_labels_vec = training_labels + .to_vec1::()? + .into_iter() + .map(|x| if x == args.digit { 1.0 } else { 0.0 }) + .collect::>(); + let len = training_labels_vec.len(); + let training_labels = Tensor::from_vec(training_labels_vec, (len,), &device)?; + + let test_images = dataset.test_images.to_device(&device)?; + let test_labels = dataset.test_labels.to_device(&device)?; + let test_labels_vec = test_labels + .to_vec1::()? + .into_iter() + .map(|x| if x == args.digit { 1f32 } else { 0f32 }) + .collect::>(); + let len = test_labels_vec.len(); + let test_labels = Tensor::from_vec(test_labels_vec, (len,), &device)?; + + let mut model = LogisticRegression::new(n, device.clone())?; + let (training_size, _) = training_images.shape().dims2()?; + let n_batches = training_size / BATCH_SIZE; + let mut batch_idxs = (0..n_batches).collect::>(); + + for epoch in 0..args.epochs { + let mut sum_loss = 0.0; + batch_idxs.shuffle(&mut rand::thread_rng()); + for batch_idx in batch_idxs.iter() { + let train_data = training_images.narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?; + let train_labels = training_labels.narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?; + model.train( + &train_data, + &train_labels, + args.learning_rate, + args.regularization, + )?; + let predictions = model.hypothesis(&train_data)?; + let loss = model.loss(&predictions, &train_labels)?; + sum_loss += loss; + } + if args.progress && epoch % 1000 == 0 { + let predictions = model.hypothesis(&test_images)?; + let predictions_vec = predictions + .to_vec1::()? + .into_iter() + .map(|x| if x > 0.5 { 1f32 } else { 0f32 }) + .collect::>(); + let predictions = Tensor::from_vec(predictions_vec, (len,), &device)?; + + let accuracy = predictions + .eq(&test_labels)? + .to_vec1::()? 
+ .into_iter() + .map(f32::from) + .sum::() + / len as f32; + println!( + "epoch: {epoch}, loss: {}, Test Accuracy: {}", + sum_loss / n_batches as f32, + accuracy + ); + } + } + + Ok(()) +} diff --git a/neural-networks/Cargo.toml b/neural-networks/Cargo.toml new file mode 100644 index 0000000..7896041 --- /dev/null +++ b/neural-networks/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "neural-networks" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0.40" +csv = "1.1.6" +clap = { version = "4.5.1", features = ["derive"] } +rand = "0.8.5" +candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"]} +candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.4.1" } diff --git a/neural-networks/README.md b/neural-networks/README.md new file mode 100644 index 0000000..510168a --- /dev/null +++ b/neural-networks/README.md @@ -0,0 +1,3 @@ +## Multi-Classifier using Neural Networks + +A simple two hidden layer [Neural Network](https://youtu.be/UVjj2fHu9YU?si=R8-wuF1QAYDK_SGy) that can classify the images from [MNIST Fashion dataset](https://www.kaggle.com/datasets/zalando-research/fashionmnist). diff --git a/neural-networks/src/main.rs b/neural-networks/src/main.rs new file mode 100644 index 0000000..a3bb71a --- /dev/null +++ b/neural-networks/src/main.rs @@ -0,0 +1,135 @@ +use anyhow::Result; +use candle_core::{DType, Device, Tensor, D}; +use candle_nn::{loss, ops, Linear, Module, Optimizer, VarBuilder, VarMap}; +use clap::Parser; +use std::rc::Rc; + +const IMAGE_DIM: usize = 28 * 28; +const LABELS: usize = 10; + +struct Dataset { + pub training_data: Tensor, + pub training_labels: Tensor, + pub test_data: Tensor, + pub test_labels: Tensor, +} + +fn load_tensors(csv: &str, device: &Device) -> Result<(Tensor, Tensor)> { + let mut data = Vec::new(); + let mut labels = Vec::new(); + + let mut rdr = csv::Reader::from_path(csv)?; + for result in rdr.records() { + let record = result?; + let label = record.get(0).unwrap().parse::()?; + let mut features = Vec::new(); + for i in 1..record.len() { + features.push(record.get(i).unwrap().parse::()? 
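+ // scale raw pixel intensities from 0..=255 into the 0.0..=1.0 range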
/ 255.0); + } + labels.push(label); + data.push(features); + } + + let data = data.into_iter().flatten().collect::>(); + let data = Tensor::from_slice(&data, (labels.len(), IMAGE_DIM), device)?; + let labels = Tensor::from_slice(&labels, (labels.len(),), device)?; + + Ok((data, labels)) +} + +fn load_dataset(train_csv: &str, test_csv: &str, device: &Device) -> Result { + let (training_data, training_labels) = load_tensors(train_csv, device)?; + let (test_data, test_labels) = load_tensors(test_csv, device)?; + + Ok(Dataset { + training_data, + training_labels, + test_data, + test_labels, + }) +} +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + #[arg(long)] + train_csv: String, + + #[arg(long)] + test_csv: String, + + // Print the Cost and Loss at each epoch + #[arg(long, default_value_t = false)] + progress: bool, + + // The learning rate + #[arg(long, default_value = "0.01")] + learning_rate: f64, + + // The regularization parameter + #[arg(long, default_value = "0.01")] + regularization: f32, + + // The number of epochs + #[arg(long, default_value = "5000")] + epochs: i32, +} + +struct Mlp { + ln1: Linear, + ln2: Linear, +} + +impl Mlp { + fn new(vs: VarBuilder) -> Result { + let ln1 = candle_nn::linear(IMAGE_DIM, 100, vs.pp("ln1"))?; + let ln2 = candle_nn::linear(100, LABELS, vs.pp("ln2"))?; + Ok(Self { ln1, ln2 }) + } + + fn forward(&self, xs: &Tensor) -> Result { + let xs = self.ln1.forward(xs)?; + let xs = xs.relu()?; + Ok(self.ln2.forward(&xs)?) + } +} + +fn main() -> Result<()> { + let args = Args::parse(); + let device = Rc::new(Device::cuda_if_available(0)?); + let dataset = load_dataset(&args.train_csv, &args.test_csv, &device)?; + + let varmap = VarMap::new(); + let vs = VarBuilder::from_varmap(&varmap, DType::F32, &device); + let model = Mlp::new(vs)?; + let mut sgd = candle_nn::SGD::new(varmap.all_vars(), args.learning_rate)?; + + let test_images = dataset.test_data.to_device(&device)?; + let test_labels = dataset + .test_labels + .to_dtype(DType::U32)? + .to_device(&device)?; + for epoch in 1..args.epochs { + let logits = model.forward(&dataset.training_data)?; + let log_sm = ops::log_softmax(&logits, D::Minus1)?; + let loss = loss::nll(&log_sm, &dataset.training_labels)?; + sgd.backward_step(&loss)?; + + let test_logits = model.forward(&test_images)?; + let sum_ok = test_logits + .argmax(D::Minus1)? + .eq(&test_labels)? + .to_dtype(DType::F32)? + .sum_all()? + .to_scalar::()?; + let test_accuracy = sum_ok / test_labels.dims1()? as f32; + if args.progress && epoch % 100 == 0 { + println!( + "{epoch:4} train loss: {:8.5} test acc: {:5.2}%", + loss.to_scalar::()?, + 100. 
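+ // scale the accuracy to a percentage for display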
* test_accuracy + ); + } + } + + Ok(()) +} diff --git a/pca/Cargo.toml b/pca/Cargo.toml new file mode 100644 index 0000000..141248a --- /dev/null +++ b/pca/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "pca" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +nalgebra = "0.32.4" +csv = "1.1.6" +anyhow = "1.0.40" +clap = {version = "4.3.1", features = ["derive"]} +candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] } \ No newline at end of file diff --git a/pca/README.md b/pca/README.md new file mode 100644 index 0000000..05b88ad --- /dev/null +++ b/pca/README.md @@ -0,0 +1,3 @@ +# Principal Component Analysis + +Implementation of the [PCA algorithm](https://youtu.be/pAwjiGkafbM?si=BBsViJAkIGD89_Ub) to reduce the dimensions of the Breast Cancer Wisconsin (Diagnostic) Data Set [dataset](https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data) \ No newline at end of file diff --git a/pca/src/main.rs b/pca/src/main.rs new file mode 100644 index 0000000..c896307 --- /dev/null +++ b/pca/src/main.rs @@ -0,0 +1,100 @@ +use anyhow::{Ok, Result}; +use candle_core::{Device, Tensor, D}; +use clap::Parser; +use nalgebra::linalg::SymmetricEigen; +use nalgebra::DMatrix; + +fn load_dataset(file_path: &str, device: &Device) -> Result { + let mut rdr = csv::Reader::from_path(file_path)?; + let mut data = Vec::new(); + for result in rdr.records() { + let record = result?; + let mut row = Vec::new(); + for i in 2..32 { + let value = record[i].parse::()?; + row.push(value); + } + data.push(row); + } + let feature_cnt = data[0].len(); + let sample_cnt = data.len(); + let data = data.into_iter().flatten().collect::>(); + let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?; + Ok(data) +} + +fn z_score_normalize(data: &Tensor) -> Result { + let mean = data.mean(0)?; + let squared_diff = data.broadcast_sub(&mean)?.sqr()?; + let variance = squared_diff.mean(0)?; + let std_dev = variance.sqrt()?; + let normalized = data.broadcast_sub(&mean)?.broadcast_div(&std_dev)?; + Ok(normalized) +} + +fn cov(data: &Tensor, device: &Device) -> Result { + let mean = data.mean(0)?; + let centered = data.broadcast_sub(&mean)?; + let (m, _) = data.shape().dims2()?; + let cov = centered + .transpose(D::Minus1, D::Minus2)? + .matmul(¢ered)? + .broadcast_div(&Tensor::new(m as f32, device)?)?; + + Ok(cov) +} + +fn pca(normalized_data: &Tensor, device: &Device, variance: f32) -> Result { + let (_, n) = normalized_data.shape().dims2()?; + let cov = cov(normalized_data, device)?; + let vec: Vec = cov + .to_device(&Device::Cpu)? + .to_vec2()? 
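+ // move the covariance matrix to the CPU and flatten it so nalgebra's DMatrix can take ownership of the values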
+ .into_iter() + .flatten() + .collect(); + let dmatrix = DMatrix::from_vec(n, n, vec); + let eig = SymmetricEigen::new(dmatrix); + let eigen_values = eig.eigenvalues.data.as_vec(); + let total = eigen_values.iter().sum::(); + let mut k = 0; + for i in 0..n { + let var = eigen_values[0..i].iter().sum::() / total; + if var > variance { + println!("{} components explain {}% of the variance", i, var * 100.0); + k = i; + break; + } + } + + let eigen_vectors = eig.eigenvectors.data.as_vec(); + let eigen_vectors = eigen_vectors + .chunks(n) + .take(k) + .flatten() + .copied() + .collect::>(); + let eigen_vectors = Tensor::from_slice(eigen_vectors.as_slice(), (k, n), device)?; + Ok(eigen_vectors) +} + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + // Data CSV file from https://www.kaggle.com/datasets/uciml/iris/data + #[arg(long)] + data_csv: String, + + #[arg(long, default_value = "0.95")] + variance: f32, +} +fn main() -> Result<()> { + let args = Args::parse(); + let device = Device::cuda_if_available(0)?; + let data = load_dataset(&args.data_csv, &device).unwrap(); + let normalized_data = z_score_normalize(&data)?; + let reduce = pca(&normalized_data, &device, args.variance)?; + let compressed_data = data.matmul(&reduce.transpose(D::Minus1, D::Minus2)?)?; + println!("Compressed data {:?}", compressed_data); + Ok(()) +} diff --git a/recommender-system/Cargo.toml b/recommender-system/Cargo.toml new file mode 100644 index 0000000..1144f10 --- /dev/null +++ b/recommender-system/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "recommender-system" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +csv = "1.1.6" +anyhow = "1.0.40" +clap = {version = "4.3.1", features = ["derive"]} +rand = "0.8.5" +candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] } \ No newline at end of file diff --git a/recommender-system/README.md b/recommender-system/README.md new file mode 100644 index 0000000..29d515a --- /dev/null +++ b/recommender-system/README.md @@ -0,0 +1,3 @@ +# Recommender System + +Build a Movie [recommender system](https://youtu.be/GIcuSNAAa4g?si=eiKFRfJXek15lO2_) using Collaborative filtering learning algorithm for the [MovieLens-100K](https://www.kaggle.com/datasets/rajmehra03/movielens100k/code) dataset. \ No newline at end of file diff --git a/recommender-system/src/main.rs b/recommender-system/src/main.rs new file mode 100644 index 0000000..a93d2cc --- /dev/null +++ b/recommender-system/src/main.rs @@ -0,0 +1,234 @@ +extern crate csv; +use rand::seq::SliceRandom; +use rand::thread_rng; +use std::collections::HashSet; +use std::vec; +use std::{cmp::Ordering, collections::HashMap}; + +use anyhow::Result; +use candle_core::{Device, Tensor, D}; +use clap::Parser; + +#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)] +struct Rating { + user: u32, + movie: u32, + rating_u32: u32, +} + +impl Rating { + fn rating(&self) -> f32 { + self.rating_u32 as f32 / 10.0 + } +} + +// Step 2: Implement `PartialOrd` and `Ord` for the struct. +impl PartialOrd for Rating { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Rating { + fn cmp(&self, other: &Self) -> Ordering { + // First compare by `userId`, then by `movieId`. 
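+ // a total order lets the ratings be sorted deterministically before the Y and R matrices are built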
+ self.user + .cmp(&other.user) + .then_with(|| self.movie.cmp(&other.movie)) + } +} + +#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)] +struct MovieDistance { + id: u32, + distance: u32, +} + +// Step 2: Implement `PartialOrd` and `Ord` for the struct. +impl PartialOrd for MovieDistance { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for MovieDistance { + fn cmp(&self, other: &Self) -> Ordering { + self.distance.cmp(&other.distance) + } +} + +impl MovieDistance { + fn new(id: u32, distance: u32) -> Self { + Self { id, distance } + } +} + +fn load_ratings(file_path: &str) -> Result<(HashSet, HashSet, HashSet)> { + let mut rdr = csv::Reader::from_path(file_path)?; + let mut users: HashSet = HashSet::new(); + let mut movies: HashSet = HashSet::new(); + let mut ratings: HashSet = HashSet::new(); + + for result in rdr.records() { + let record = result?; + let user: u32 = record[0].parse()?; + let movie: u32 = record[1].parse()?; + let rating: f32 = record[2].parse()?; + let rating_u32 = (rating * 10.0).round() as u32; + users.insert(user); + movies.insert(movie); + ratings.insert(Rating { + user, + movie, + rating_u32, + }); + } + + Ok((users, movies, ratings)) +} + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + // Data CSV file from https://www.kaggle.com/c/eecs498/data + #[arg(long)] + ratings_csv: String, + + #[arg(long)] + movies_csv: String, + + // Number of epochs to train + #[arg(long, default_value = "250")] + epochs: u32, + + // Learning rate + #[arg(long, default_value = "0.01")] + lr: f32, + + // Regularization factor + #[arg(long, default_value = "0.01")] + reg: f32, + + // Number of features + #[arg(long, default_value = "100")] + n_features: usize, +} + +fn cdist(x1: &Tensor, x2: &Tensor) -> Result { + let diff = x1.sub(&x2)?; + let dist = diff.sqr()?.sum_all()?.sqrt()?; + Ok(dist) +} + +fn mean_normalization(ratings: &Tensor, R: &Tensor) -> Result { + let sum = ratings.mul(&R)?.sum(1)?; + let count = R.sum(1)?; + let mean = sum.div(&count)?; + let adjusted = ratings.broadcast_sub(&mean.unsqueeze(1)?)?; + Ok(adjusted) +} + +fn cost(X: &Tensor, W: &Tensor, Y: &Tensor, R: &Tensor) -> Result { + let c = X + .matmul(&W.t()?)? + .mul(&R)? + .sub(&Y.mul(&R)?)? + .sqr()? + .sum_all()? 
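+ // squared reconstruction error of X * W^T against Y, restricted to entries where a rating exists (R == 1)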
+ .to_scalar::()?; + Ok(c) +} + +fn main() -> Result<()> { + let args = Args::parse(); + let reg = Tensor::new(args.reg, &Device::cuda_if_available(0)?)?; + let lr = Tensor::new(args.lr, &Device::cuda_if_available(0)?)?; + + let device = Device::cuda_if_available(0)?; + + let (users, movies, ratings) = load_ratings(&args.ratings_csv).unwrap(); + let mut users: Vec = users.into_iter().collect(); + users.sort(); + + let mut movies: Vec = movies.into_iter().collect(); + movies.sort(); + + let mut ratings: Vec = ratings.into_iter().collect(); + ratings.sort(); + + let n_users = users.len(); + let n_movies = movies.len(); + + println!("n_users: {}, n_movies: {}", n_users, n_movies); + + let mut Y = vec![vec![-1.0; n_users as usize]; n_movies as usize]; + let mut R = vec![vec![0.0; n_users as usize]; n_movies as usize]; + + for rating in ratings.iter() { + let i = movies.iter().position(|&x| x == rating.movie).unwrap(); + let j = users.iter().position(|&x| x == rating.user).unwrap(); + Y[i][j] = rating.rating(); + R[i][j] = 1.0; + } + let R = R.iter().flatten().copied().collect::>(); + let R = Tensor::from_slice(&R, (n_movies, n_users), &device)?; + + let Y = Y.iter().flatten().copied().collect::>(); + let Y = Tensor::from_slice(&Y, (n_movies, n_users), &device)?; + let Y = mean_normalization(&Y, &R)?; + + let mut X = Tensor::randn(0f32, 0.1, (n_movies, args.n_features), &device)?; + let mut W = Tensor::randn(0f32, 0.1, (n_users, args.n_features), &device)?; + + for i in 0..args.epochs { + let diff = X.matmul(&W.t()?)?.mul(&R)?.sub(&Y.mul(&R)?)?; + let grad_X = diff.matmul(&W)?.add(&X.broadcast_mul(®)?)?; + let grad_W = diff.t()?.matmul(&X)?.add(&W.broadcast_mul(®)?)?; + + X = X.sub(&grad_X.broadcast_mul(&lr)?)?; + W = W.sub(&grad_W.broadcast_mul(&lr)?)?; + } + + // Load movie titles + let mut rdr = csv::Reader::from_path(&args.movies_csv)?; + let mut movie_titles = HashMap::new(); + for result in rdr.records() { + let record = result?; + let movie_id: u32 = record[0].parse()?; + let title = record[1].to_string(); + movie_titles.insert(movie_id, title); + } + + // Choose a random movie and find similar movies + let mut rng = thread_rng(); + let random_movie_id = movies.choose(&mut rng).unwrap(); + println!("Random movie: {}", movie_titles[random_movie_id]); + + let random_movie_idx = movies.iter().position(|&x| x == *random_movie_id).unwrap(); + let random_index_tensor = Tensor::from_slice(&[random_movie_idx as u32], &[1], &device)?; + let random_movie_features = X.index_select(&random_index_tensor, 0)?; + + let mut movie_distances: Vec = Vec::new(); + for i in 0..n_movies { + let movie_index_tensor = Tensor::from_slice(&[i as u32], &[1], &device)?; + let movie_features = X.index_select(&movie_index_tensor, 0)?; + let dist = cdist(&random_movie_features, &movie_features)?; + let dist = dist.to_scalar::()?; + let movie_distance = MovieDistance::new(movies[i], (dist * 1000.0) as u32); + movie_distances.push(movie_distance); + } + + movie_distances.sort(); + for i in 0..10 { + let movie_id = movie_distances[i].id; + let distance = movie_distances[i].distance; + println!( + "{}: {} (distance: {})", + i + 1, + movie_titles[&movie_id], + distance + ); + } + + Ok(()) +}