Initial commit

Vishal Patil
2024-04-01 20:19:08 -04:00
commit 26c8803281
22 changed files with 1226 additions and 0 deletions

README.md Normal file

@@ -0,0 +1,5 @@
## Machine Learning with Rust using Candle
This repository features Rust implementations of algorithms from the Stanford University [Machine Learning Course](https://www.youtube.com/@machinelearningandai3274), built with the [Candle](https://github.com/huggingface/candle) crate. Each example uses a Kaggle dataset to demonstrate the algorithm in practice.
All of the examples are CUDA-enabled but can also run on a machine without a GPU, though they may be quite slow.
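
Every example picks its device with the same one-liner, falling back to the CPU when no GPU is available. A minimal sketch of that pattern (the examples below do this inside their own `main`):

```rust
use candle_core::Device;

fn main() -> anyhow::Result<()> {
    // Uses the first CUDA device if the crate was built with the `cuda`
    // feature and a GPU is present; otherwise falls back to the CPU.
    let device = Device::cuda_if_available(0)?;
    println!("Running on {:?}", device);
    Ok(())
}
```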

anamoly-detection/Cargo.toml Normal file

@@ -0,0 +1,13 @@
[package]
name = "anamoly-detection"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
csv = "1.1.6"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }

anamoly-detection/README.md Normal file

@@ -0,0 +1,3 @@
# Anomaly Detection
[Anomaly Detection](https://youtu.be/UqqPm-Q4aMo?si=TCZFJOJv94R1i71u) using a Gaussian distribution for the Kaggle [EECS 498 dataset](https://www.kaggle.com/c/eecs498/data).
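
Concretely, the example standardizes the features, fits an independent Gaussian to each one, and flags a sample as an anomaly when the product of the per-feature densities falls below the `--epsilon` threshold:

$$p(x) = \prod_{j=1}^{n} \frac{1}{\sqrt{2\pi}\,\sigma_j} \exp\!\left(-\frac{(x_j-\mu_j)^2}{2\sigma_j^2}\right), \qquad \text{anomaly} \iff p(x) < \epsilon$$

To run it (assuming the Kaggle CSV has been saved locally, e.g. as `data.csv`):

```bash
cargo run -- --data-csv ./data.csv --epsilon 0.001 --print
```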

anamoly-detection/src/main.rs Normal file

@@ -0,0 +1,100 @@
extern crate csv;
use std::vec;
use anyhow::Result;
use candle_core::{Device, Tensor};
use clap::Parser;
fn load_dataset(file_path: &str, device: &Device) -> Result<Tensor> {
let mut rdr = csv::Reader::from_path(file_path)?;
let mut data = Vec::new();
for result in rdr.records() {
let record = result?;
let mut row = vec![];
for i in 1..4 {
row.push(record[i].parse::<f64>()?);
}
data.push(row);
}
let feature_cnt = data[0].len();
let sample_cnt = data.len();
let data = data.into_iter().flatten().collect::<Vec<_>>();
let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?;
Ok(data)
}
fn z_score_normalize(data: &Tensor) -> Result<Tensor> {
let mean = data.mean(0)?;
let squared_diff = data.broadcast_sub(&mean)?.sqr()?;
let variance = squared_diff.mean(0)?;
let std_dev = variance.sqrt()?;
let normalized = data.broadcast_sub(&mean)?.broadcast_div(&std_dev)?;
Ok(normalized)
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Data CSV file from https://www.kaggle.com/c/eecs498/data
    #[arg(long)]
    data_csv: String,
    /// Print the index of each detected anomaly
    #[arg(long, short, default_value_t = false)]
    print: bool,
    /// Probability threshold below which a sample is flagged as an anomaly
    #[arg(long, default_value = "0.001")]
    epsilon: f64,
}
// Density of a single sample under a product of independent per-feature Gaussians:
// p(x) = prod_j exp(-(x_j - mu_j)^2 / (2 sigma_j^2)) / (sqrt(2 pi) sigma_j).
// The missing minus sign in the exponent is handled by the final `recip`.
fn p_x(
x: &Tensor,
mean: &Tensor,
two_variance: &Tensor,
two_pi_sqrt_std_dev: &Tensor,
) -> Result<f64> {
let px = x
.broadcast_sub(mean)?
.sqr()?
.broadcast_div(two_variance)?
.exp()?
.broadcast_mul(two_pi_sqrt_std_dev)?
.recip()?;
let px = px.to_vec1::<f64>()?.into_iter().fold(1.0, |acc, x| acc * x);
Ok(px)
}
fn main() -> Result<()> {
let args = Args::parse();
let device = Device::cuda_if_available(0)?;
let data = load_dataset(&args.data_csv, &device)?;
let data = z_score_normalize(&data)?;
let mean = data.mean(0)?;
let variance = data.broadcast_sub(&mean)?.sqr()?.mean(0)?;
let std_dev = variance.sqrt()?;
let two_variance = variance.broadcast_mul(&Tensor::new(2.0, &device)?)?;
let two_pi_sqrt_std_dev =
std_dev.broadcast_mul(&Tensor::new(2.0 * std::f64::consts::PI, &device)?.sqrt()?)?;
let rows = data.shape().dims2()?.0;
    let mut anomalies = 0;
for row in 0..rows {
let row_tensor = data
.index_select(&Tensor::new(&[row as u32], &device)?, 0)?
.squeeze(0)?;
let px = p_x(&row_tensor, &mean, &two_variance, &two_pi_sqrt_std_dev)?;
        if px < args.epsilon {
            anomalies += 1;
if args.print {
println!("Anamoly: {}", row + 1);
}
}
}
println!("Anamolies: {}, Total: {}", anamolies, rows);
Ok(())
}

k-means/Cargo.toml Normal file

@@ -0,0 +1,13 @@
[package]
name = "k-means"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
csv = "1.1.6"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }

k-means/README.md Normal file

@@ -0,0 +1,3 @@
# K-Means clustering
[K-Means clustering](https://youtu.be/0D4LnsJr85Y?si=qmkMIqY39rMJUBDk) for the Kaggle [Iris dataset](https://www.kaggle.com/datasets/uciml/iris/data).
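
To run it (assuming the Kaggle CSV has been saved locally, e.g. as `Iris.csv`; `--k` and `--max-iter` are optional and default to 3 and 100):

```bash
cargo run -- --data-csv ./Iris.csv --k 3
```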

k-means/src/main.rs Normal file

@@ -0,0 +1,106 @@
extern crate csv;
use std::vec;
use anyhow::Result;
use candle_core::{DType, Device, Tensor, D};
use clap::Parser;
use rand::prelude::*;
// Pairwise Euclidean distances between the rows of `x1` (points) and `x2` (centroids),
// returned as an (n_points, n_centroids) matrix.
fn cdist(x1: &Tensor, x2: &Tensor) -> Result<Tensor> {
let x1 = x1.unsqueeze(0)?;
let x2 = x2.unsqueeze(1)?;
Ok(x1
.broadcast_sub(&x2)?
.sqr()?
.sum(D::Minus1)?
.sqrt()?
.transpose(D::Minus1, D::Minus2)?)
}
fn load_dataset(file_path: &str, device: &Device) -> Result<Tensor> {
let mut rdr = csv::Reader::from_path(file_path)?;
let mut data = Vec::new();
for result in rdr.records() {
let record = result?;
let mut row = vec![];
for i in 1..5 {
row.push(record[i].parse::<f64>()?);
}
data.push(row);
}
let feature_cnt = data[0].len();
let sample_cnt = data.len();
let data = data.into_iter().flatten().collect::<Vec<_>>();
let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?;
Ok(data)
}
fn k_means(data: &Tensor, k: usize, max_iter: i64, device: &Device) -> Result<(Tensor, Tensor)> {
let (n, _) = data.dims2()?;
let mut rng = rand::thread_rng();
let mut indices = (0..n).collect::<Vec<_>>();
indices.shuffle(&mut rng);
let centroid_idx = indices[..k]
.iter()
.copied()
.map(|x| x as i64)
.collect::<Vec<_>>();
let centroid_idx_tensor = Tensor::from_slice(centroid_idx.as_slice(), (k,), device)?;
let mut centers = data.index_select(&centroid_idx_tensor, 0)?;
let mut cluster_assignments = Tensor::zeros((n,), DType::U32, device)?;
for _ in 0..max_iter {
let dist = cdist(data, &centers)?;
cluster_assignments = dist.argmin(D::Minus1)?;
let mut centers_vec = vec![];
for i in 0..k {
let mut indices = vec![];
cluster_assignments
.to_vec1::<u32>()?
.iter()
.enumerate()
.for_each(|(j, x)| {
if *x == i as u32 {
indices.push(j as u32);
}
});
let indices = Tensor::from_slice(indices.as_slice(), (indices.len(),), device)?;
let cluster_data = data.index_select(&indices, 0)?;
let mean = cluster_data.mean(0)?;
centers_vec.push(mean);
}
centers = Tensor::stack(centers_vec.as_slice(), 0)?;
}
Ok((centers, cluster_assignments))
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Data CSV file from https://www.kaggle.com/datasets/uciml/iris/data
    #[arg(long)]
    data_csv: String,
    /// Number of clusters
    #[arg(long, default_value = "3")]
    k: usize,
    /// Maximum number of iterations
    #[arg(long, default_value = "100")]
    max_iter: i64,
}
fn main() -> Result<()> {
let args = Args::parse();
let device = Device::cuda_if_available(0)?;
    let data = load_dataset(&args.data_csv, &device)?;
let (centers, cluster_assignments) = k_means(&data, args.k, args.max_iter, &device)?;
println!("{}", centers);
println!("{}", cluster_assignments);
let cluster_sizes = cluster_assignments.to_vec1::<u32>()?;
for i in 0..args.k {
let size = cluster_sizes.iter().filter(|&&x| x == i as u32).count();
println!("Cluster {} size: {}", i, size);
}
Ok(())
}

linear-regression/Cargo.toml Normal file

@@ -0,0 +1,14 @@
[package]
name = "linear-regression"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
csv = "1.1.6"
ndarray = "0.15.3"
candle = { git = "https://github.com/huggingface/candle", package = "candle-core", features = ["cuda"]}
anyhow = "1.0.40"
clap = { version = "4.5.1", features = ["derive"] }
rand = "0.8.5"

linear-regression/README.md Normal file

@@ -0,0 +1,7 @@
# Linear regression
A [Linear regression](https://youtu.be/W46UTQ_JDPk?si=dfz9_kFBUkM3E1RR) model (with regularization) trained with gradient descent on the Kaggle [insurance dataset](https://www.kaggle.com/code/kianwee/linear-regression-insurance-dataset).
```bash
cargo run -- --data-csv ./insurance.csv
```
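
For reference, a sketch of the regularized gradient-descent update implemented in `train` (α is `--learning-rate`, λ is `--regularization`, m is the batch size):

$$w \leftarrow w - \alpha\left(\frac{1}{m} X^\top (Xw + b - y) + \frac{\lambda}{m} w\right), \qquad b \leftarrow b - \alpha \cdot \frac{1}{m}\sum_{i=1}^{m}\left(x^{(i)\top} w + b - y^{(i)}\right)$$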

linear-regression/src/main.rs Normal file

@@ -0,0 +1,245 @@
extern crate csv;
use anyhow::Result;
use candle::{Device, Tensor, D};
use clap::Parser;
use core::panic;
use rand::prelude::*;
use std::fs::File;
use std::rc::Rc;
struct Dataset {
pub training_data: Tensor,
pub training_labels: Tensor,
pub test_data: Tensor,
pub test_labels: Tensor,
pub feature_cnt: usize,
}
// Implement Linear Regression model using Gradient Descent
// https://www.youtube.com/watch?v=UVCFaaEBnTE
struct LinearRegression {
weights: Tensor,
bias: Tensor,
device: Rc<Device>,
}
impl LinearRegression {
fn new(feature_cnt: usize, device: Rc<Device>) -> Result<Self> {
let weights: Vec<f32> = vec![0.0; feature_cnt];
let weights = Tensor::from_vec(weights, (feature_cnt,), &device)?;
let bias = Tensor::new(0.0f32, &device)?;
Ok(Self {
weights,
bias,
device,
})
}
fn hypothesis(&self, x: &Tensor) -> Result<Tensor> {
Ok(x.matmul(&self.weights.unsqueeze(1)?)?
.squeeze(1)?
.broadcast_add(&self.bias)?)
}
fn loss(&self, y1: &Tensor, y2: &Tensor) -> Result<f32> {
let diff = y1.sub(y2)?;
let loss = diff.mul(&diff)?.mean(D::Minus1)?.to_scalar::<f32>()?;
Ok(loss)
}
fn train(
&mut self,
x: &Tensor,
y: &Tensor,
learning_rate: f32,
regularization: f32,
) -> Result<()> {
let m = y.shape().dims1()?;
let predictions = self.hypothesis(x)?;
let deltas = predictions.sub(y)?;
let regularization = self
.weights
.broadcast_mul(&Tensor::new(regularization / m as f32, &self.device)?)?;
let gradient = x
.t()?
.matmul(&deltas.unsqueeze(D::Minus1)?)?
.broadcast_div(&Tensor::new(m as f32, &self.device)?)?;
let gradient = gradient
.squeeze(D::Minus1)?
.squeeze(D::Minus1)?
.add(&regularization)?;
self.weights = self
.weights
.sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?;
let gradient = deltas.mean(D::Minus1)?;
self.bias = self
.bias
.sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?;
Ok(())
}
}
// Coefficient of determination (R^2) of the predictions against the labels.
fn r2_score(predictions: &Tensor, labels: &Tensor) -> Result<f32> {
let mean = labels.mean(D::Minus1)?;
let ssr = labels.sub(predictions)?;
let ssr = ssr.mul(&ssr)?.sum(D::Minus1)?;
let sst = labels.broadcast_sub(&mean)?;
let sst = sst.mul(&sst)?.sum(D::Minus1)?;
let tmp = ssr.div(&sst)?.to_scalar::<f32>()?;
Ok(1.0 - tmp)
}
const BATCH_SIZE: usize = 100;
fn insurance_dataset(file_path: &str, device: &Device) -> Result<Dataset> {
// https://www.kaggle.com/mirichoi0218/insurance
let file = File::open(file_path)?;
let mut rdr = csv::Reader::from_reader(file);
let mut data: Vec<Vec<f32>> = vec![];
let mut labels: Vec<f32> = vec![];
const FEATURE_CNT: usize = 6;
const MALE: f32 = 0.5;
const FEMALE: f32 = -0.5;
const YES: f32 = 0.5;
const NO: f32 = -0.5;
const NORTHWEST: f32 = 0.25;
const NORTHEAST: f32 = -0.25;
const SOUTHWEST: f32 = 0.5;
const SOUTHEAST: f32 = -0.5;
for result in rdr.records() {
let record = result?;
let age: f32 = (record[0].parse::<u32>()? as f32) / 100.0;
let gender = match record[1].parse::<String>()?.as_str() {
"male" => MALE,
"female" => FEMALE,
_ => panic!("Invalid Gender"),
};
let bmi: f32 = record[2].parse::<f32>()? / 100.0;
let children: f32 = record[3].parse()?;
let smoker = match record[4].parse::<String>()?.as_str() {
"yes" => YES,
"no" => NO,
_ => panic!("Invalid Smoker"),
};
let region = match record[5].parse::<String>()?.as_str() {
"northwest" => NORTHWEST,
"northeast" => NORTHEAST,
"southwest" => SOUTHWEST,
"southeast" => SOUTHEAST,
_ => panic!("Invalid Region"),
};
let charges: f32 = record[6].parse()?;
let row = vec![age, gender, bmi, children, smoker, region];
data.push(row);
let label = charges;
labels.push(label);
}
let training_size = labels.len() * 8 / 10;
let training_data = data[..training_size].to_vec();
let training_labels = labels[..training_size].to_vec();
let training_data = training_data
.iter()
.flatten()
.copied()
.collect::<Vec<f32>>();
let training_data_tensor =
Tensor::from_slice(&training_data, (training_labels.len(), FEATURE_CNT), device)?;
let training_labels_tensor =
Tensor::from_slice(&training_labels, (training_labels.len(),), device)?;
let test_data = data[training_size..].to_vec();
let test_labels = labels[training_size..].to_vec();
let test_data = test_data.iter().flatten().copied().collect::<Vec<f32>>();
let test_data_tensor =
Tensor::from_slice(&test_data, (test_labels.len(), FEATURE_CNT), device)?;
let test_labels_tensor = Tensor::from_slice(&test_labels, (test_labels.len(),), device)?;
Ok(Dataset {
training_data: training_data_tensor,
training_labels: training_labels_tensor,
test_data: test_data_tensor,
test_labels: test_labels_tensor,
feature_cnt: FEATURE_CNT,
})
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Path to the insurance CSV file
    #[arg(long)]
    data_csv: String,
    /// Print the training loss and test R^2 every 1000 epochs
    #[arg(long, default_value_t = false)]
    progress: bool,
    /// The learning rate
    #[arg(long, default_value = "0.01")]
    learning_rate: f32,
    /// The regularization parameter
    #[arg(long, default_value = "0.01")]
    regularization: f32,
    /// The number of epochs
    #[arg(long, default_value = "10000")]
    epochs: i32,
}
fn main() -> Result<()> {
let args = Args::parse();
let file_path = args.data_csv;
let device = Rc::new(Device::cuda_if_available(0)?);
let dataset = insurance_dataset(&file_path, &device)?;
let mut model = LinearRegression::new(dataset.feature_cnt, device)?;
let (training_size, _) = dataset.training_data.shape().dims2()?;
let n_batches = training_size / BATCH_SIZE;
let mut batch_idxs = (0..n_batches).collect::<Vec<usize>>();
for epoch in 0..args.epochs {
let mut sum_loss = 0.0;
batch_idxs.shuffle(&mut rand::thread_rng());
for batch_idx in batch_idxs.iter() {
let train_data = dataset
.training_data
.narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?;
let train_labels =
dataset
.training_labels
.narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?;
model.train(
&train_data,
&train_labels,
args.learning_rate,
args.regularization,
)?;
let predictions = model.hypothesis(&train_data)?;
let loss = model.loss(&predictions, &train_labels)?;
sum_loss += loss;
}
if args.progress && epoch % 1000 == 0 {
let predictions = model.hypothesis(&dataset.test_data)?;
            let r2 = r2_score(&predictions, &dataset.test_labels)?;
            println!("epoch: {epoch}, loss: {}, r2: {}", sum_loss / n_batches as f32, r2);
}
}
Ok(())
}

logistic-regression/Cargo.toml Normal file

@@ -0,0 +1,15 @@
[package]
name = "logistic-regression"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
csv = "1.1.6"
ndarray = "0.15.3"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }
candle-datasets = { git = "https://github.com/huggingface/candle.git", version = "0.4.1" }

logistic-regression/README.md Normal file

@@ -0,0 +1,7 @@
# Logistic Regression
A [Logistic regression](https://youtu.be/4u81xU7BIOc?si=7ZSLIqS-bzrBZWgL) model (with regularization) trained with gradient descent on the MNIST [dataset](https://www.kaggle.com/datasets/hojjatk/mnist-dataset) to distinguish a chosen digit (`--digit`, default 0) from all others.
```bash
cargo run
```
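
For reference, the model and update rule used in `train`: predictions go through the sigmoid, and the weights follow the same regularized gradient step as the linear model (α is `--learning-rate`, λ is `--regularization`, m is the batch size):

$$h(x) = \sigma(w^\top x + b), \quad \sigma(z) = \frac{1}{1 + e^{-z}}, \qquad w \leftarrow w - \alpha\left(\frac{1}{m} X^\top \big(h(X) - y\big) + \frac{\lambda}{m} w\right)$$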

logistic-regression/src/main.rs Normal file

@@ -0,0 +1,177 @@
extern crate csv;
use anyhow::Result;
use candle_core::{Device, Tensor, D};
use clap::Parser;
use rand::prelude::*;
use std::rc::Rc;
// Implement Logistic Regression model using Gradient Descent
// https://www.youtube.com/watch?v=4u81xU7BIOc
struct LogisticRegression {
weights: Tensor,
bias: Tensor,
device: Rc<Device>,
}
fn sigmoid(xs: &Tensor) -> Result<Tensor> {
Ok((xs.neg()?.exp()? + 1.0)?.recip()?)
}
impl LogisticRegression {
fn new(feature_cnt: usize, device: Rc<Device>) -> Result<Self> {
let weights: Vec<f32> = vec![0.0; feature_cnt];
let weights = Tensor::from_vec(weights, (feature_cnt,), &device)?;
let bias = Tensor::new(0.0f32, &device)?;
Ok(Self {
weights,
bias,
device,
})
}
fn hypothesis(&self, x: &Tensor) -> Result<Tensor> {
Ok(sigmoid(
&x.matmul(&self.weights.unsqueeze(1)?)?
.squeeze(1)?
.broadcast_add(&self.bias)?,
)?)
}
    // Mean squared error between predictions and labels; used only for progress
    // reporting (the weight update in `train` follows the log-loss gradient).
    fn loss(&self, y1: &Tensor, y2: &Tensor) -> Result<f32> {
let diff = y1.sub(y2)?;
let loss = diff.mul(&diff)?.mean(D::Minus1)?.to_scalar::<f32>()?;
Ok(loss)
}
fn train(
&mut self,
x: &Tensor,
y: &Tensor,
learning_rate: f32,
regularization: f32,
) -> Result<()> {
let m = y.shape().dims1()?;
let predictions = self.hypothesis(x)?;
let deltas = predictions.sub(y)?;
let regularization = self
.weights
.broadcast_mul(&Tensor::new(regularization / m as f32, &self.device)?)?;
let gradient = x
.t()?
.matmul(&deltas.unsqueeze(D::Minus1)?)?
.broadcast_div(&Tensor::new(m as f32, &self.device)?)?;
let gradient = gradient
.squeeze(D::Minus1)?
.squeeze(D::Minus1)?
.add(&regularization)?;
self.weights = self
.weights
.sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?;
let gradient = deltas.mean(D::Minus1)?;
self.bias = self
.bias
.sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?;
Ok(())
}
}
const BATCH_SIZE: usize = 100;
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Print the training loss and test accuracy every 1000 epochs
    #[arg(long, default_value_t = false)]
    progress: bool,
    /// The learning rate
    #[arg(long, default_value = "0.01")]
    learning_rate: f32,
    /// The regularization parameter
    #[arg(long, default_value = "0.1")]
    regularization: f32,
    /// The number of epochs
    #[arg(long, default_value = "10000")]
    epochs: i32,
    /// The digit to classify (one-vs-rest)
    #[arg(long, default_value = "0")]
    digit: u8,
}
fn main() -> Result<()> {
let args = Args::parse();
let device = Rc::new(Device::cuda_if_available(0)?);
let dataset = candle_datasets::vision::mnist::load()?;
let (_, n) = dataset.train_images.shape().dims2()?;
let training_images = dataset.train_images.to_device(&device)?;
let training_labels = dataset.train_labels.to_device(&device)?;
let training_labels_vec = training_labels
.to_vec1::<u8>()?
.into_iter()
.map(|x| if x == args.digit { 1.0 } else { 0.0 })
.collect::<Vec<f32>>();
let len = training_labels_vec.len();
let training_labels = Tensor::from_vec(training_labels_vec, (len,), &device)?;
let test_images = dataset.test_images.to_device(&device)?;
let test_labels = dataset.test_labels.to_device(&device)?;
let test_labels_vec = test_labels
.to_vec1::<u8>()?
.into_iter()
.map(|x| if x == args.digit { 1f32 } else { 0f32 })
.collect::<Vec<f32>>();
let len = test_labels_vec.len();
let test_labels = Tensor::from_vec(test_labels_vec, (len,), &device)?;
let mut model = LogisticRegression::new(n, device.clone())?;
let (training_size, _) = training_images.shape().dims2()?;
let n_batches = training_size / BATCH_SIZE;
let mut batch_idxs = (0..n_batches).collect::<Vec<usize>>();
for epoch in 0..args.epochs {
let mut sum_loss = 0.0;
batch_idxs.shuffle(&mut rand::thread_rng());
for batch_idx in batch_idxs.iter() {
let train_data = training_images.narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?;
let train_labels = training_labels.narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?;
model.train(
&train_data,
&train_labels,
args.learning_rate,
args.regularization,
)?;
let predictions = model.hypothesis(&train_data)?;
let loss = model.loss(&predictions, &train_labels)?;
sum_loss += loss;
}
if args.progress && epoch % 1000 == 0 {
let predictions = model.hypothesis(&test_images)?;
let predictions_vec = predictions
.to_vec1::<f32>()?
.into_iter()
.map(|x| if x > 0.5 { 1f32 } else { 0f32 })
.collect::<Vec<f32>>();
let predictions = Tensor::from_vec(predictions_vec, (len,), &device)?;
let accuracy = predictions
.eq(&test_labels)?
.to_vec1::<u8>()?
.into_iter()
.map(f32::from)
.sum::<f32>()
/ len as f32;
println!(
"epoch: {epoch}, loss: {}, Test Accuracy: {}",
sum_loss / n_batches as f32,
accuracy
);
}
}
Ok(())
}

neural-networks/Cargo.toml Normal file

@@ -0,0 +1,14 @@
[package]
name = "neural-networks"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.40"
csv = "1.1.6"
clap = { version = "4.5.1", features = ["derive"] }
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"]}
candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.4.1" }

neural-networks/README.md Normal file

@@ -0,0 +1,3 @@
## Multi-Classifier using Neural Networks
A simple [Neural Network](https://youtu.be/UVjj2fHu9YU?si=R8-wuF1QAYDK_SGy) with a single hidden layer that classifies images from the [MNIST Fashion dataset](https://www.kaggle.com/datasets/zalando-research/fashionmnist).
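
To run it (assuming the Kaggle CSVs have been downloaded as `fashion-mnist_train.csv` and `fashion-mnist_test.csv`):

```bash
cargo run -- --train-csv ./fashion-mnist_train.csv --test-csv ./fashion-mnist_test.csv --progress
```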

neural-networks/src/main.rs Normal file

@@ -0,0 +1,135 @@
use anyhow::Result;
use candle_core::{DType, Device, Tensor, D};
use candle_nn::{loss, ops, Linear, Module, Optimizer, VarBuilder, VarMap};
use clap::Parser;
use std::rc::Rc;
const IMAGE_DIM: usize = 28 * 28;
const LABELS: usize = 10;
struct Dataset {
pub training_data: Tensor,
pub training_labels: Tensor,
pub test_data: Tensor,
pub test_labels: Tensor,
}
fn load_tensors(csv: &str, device: &Device) -> Result<(Tensor, Tensor)> {
let mut data = Vec::new();
let mut labels = Vec::new();
let mut rdr = csv::Reader::from_path(csv)?;
for result in rdr.records() {
let record = result?;
let label = record.get(0).unwrap().parse::<u32>()?;
let mut features = Vec::new();
for i in 1..record.len() {
features.push(record.get(i).unwrap().parse::<f32>()? / 255.0);
}
labels.push(label);
data.push(features);
}
let data = data.into_iter().flatten().collect::<Vec<f32>>();
let data = Tensor::from_slice(&data, (labels.len(), IMAGE_DIM), device)?;
let labels = Tensor::from_slice(&labels, (labels.len(),), device)?;
Ok((data, labels))
}
fn load_dataset(train_csv: &str, test_csv: &str, device: &Device) -> Result<Dataset> {
let (training_data, training_labels) = load_tensors(train_csv, device)?;
let (test_data, test_labels) = load_tensors(test_csv, device)?;
Ok(Dataset {
training_data,
training_labels,
test_data,
test_labels,
})
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Path to the Fashion-MNIST training CSV
    #[arg(long)]
    train_csv: String,
    /// Path to the Fashion-MNIST test CSV
    #[arg(long)]
    test_csv: String,
    /// Print the training loss and test accuracy every 100 epochs
    #[arg(long, default_value_t = false)]
    progress: bool,
    /// The learning rate
    #[arg(long, default_value = "0.01")]
    learning_rate: f64,
    /// The regularization parameter (currently unused)
    #[arg(long, default_value = "0.01")]
    regularization: f32,
    /// The number of epochs
    #[arg(long, default_value = "5000")]
    epochs: i32,
}
struct Mlp {
ln1: Linear,
ln2: Linear,
}
impl Mlp {
fn new(vs: VarBuilder) -> Result<Self> {
let ln1 = candle_nn::linear(IMAGE_DIM, 100, vs.pp("ln1"))?;
let ln2 = candle_nn::linear(100, LABELS, vs.pp("ln2"))?;
Ok(Self { ln1, ln2 })
}
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let xs = self.ln1.forward(xs)?;
let xs = xs.relu()?;
Ok(self.ln2.forward(&xs)?)
}
}
fn main() -> Result<()> {
let args = Args::parse();
let device = Rc::new(Device::cuda_if_available(0)?);
let dataset = load_dataset(&args.train_csv, &args.test_csv, &device)?;
let varmap = VarMap::new();
let vs = VarBuilder::from_varmap(&varmap, DType::F32, &device);
let model = Mlp::new(vs)?;
let mut sgd = candle_nn::SGD::new(varmap.all_vars(), args.learning_rate)?;
let test_images = dataset.test_data.to_device(&device)?;
let test_labels = dataset
.test_labels
.to_dtype(DType::U32)?
.to_device(&device)?;
for epoch in 1..args.epochs {
let logits = model.forward(&dataset.training_data)?;
let log_sm = ops::log_softmax(&logits, D::Minus1)?;
let loss = loss::nll(&log_sm, &dataset.training_labels)?;
sgd.backward_step(&loss)?;
let test_logits = model.forward(&test_images)?;
let sum_ok = test_logits
.argmax(D::Minus1)?
.eq(&test_labels)?
.to_dtype(DType::F32)?
.sum_all()?
.to_scalar::<f32>()?;
let test_accuracy = sum_ok / test_labels.dims1()? as f32;
if args.progress && epoch % 100 == 0 {
println!(
"{epoch:4} train loss: {:8.5} test acc: {:5.2}%",
loss.to_scalar::<f32>()?,
100. * test_accuracy
);
}
}
Ok(())
}

pca/Cargo.toml Normal file

@@ -0,0 +1,13 @@
[package]
name = "pca"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
nalgebra = "0.32.4"
csv = "1.1.6"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }

pca/README.md Normal file

@@ -0,0 +1,3 @@
# Principal Component Analysis
Implementation of the [PCA algorithm](https://youtu.be/pAwjiGkafbM?si=BBsViJAkIGD89_Ub) to reduce the dimensionality of the Breast Cancer Wisconsin (Diagnostic) [dataset](https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data).
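
The number of retained components k is chosen from the eigenvalues λ₁ ≥ λ₂ ≥ … of the covariance matrix of the standardized data: the smallest k whose leading eigenvalues explain at least the `--variance` fraction (default 0.95) of the total variance:

$$\frac{\sum_{i=1}^{k} \lambda_i}{\sum_{i=1}^{n} \lambda_i} \geq \text{variance}$$

To run it (assuming the Kaggle CSV has been saved locally, e.g. as `data.csv`):

```bash
cargo run -- --data-csv ./data.csv --variance 0.95
```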

pca/src/main.rs Normal file

@@ -0,0 +1,100 @@
use anyhow::{Ok, Result};
use candle_core::{Device, Tensor, D};
use clap::Parser;
use nalgebra::linalg::SymmetricEigen;
use nalgebra::DMatrix;
fn load_dataset(file_path: &str, device: &Device) -> Result<Tensor> {
let mut rdr = csv::Reader::from_path(file_path)?;
let mut data = Vec::new();
for result in rdr.records() {
let record = result?;
let mut row = Vec::new();
for i in 2..32 {
let value = record[i].parse::<f32>()?;
row.push(value);
}
data.push(row);
}
let feature_cnt = data[0].len();
let sample_cnt = data.len();
let data = data.into_iter().flatten().collect::<Vec<_>>();
let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?;
Ok(data)
}
fn z_score_normalize(data: &Tensor) -> Result<Tensor> {
let mean = data.mean(0)?;
let squared_diff = data.broadcast_sub(&mean)?.sqr()?;
let variance = squared_diff.mean(0)?;
let std_dev = variance.sqrt()?;
let normalized = data.broadcast_sub(&mean)?.broadcast_div(&std_dev)?;
Ok(normalized)
}
fn cov(data: &Tensor, device: &Device) -> Result<Tensor> {
let mean = data.mean(0)?;
let centered = data.broadcast_sub(&mean)?;
let (m, _) = data.shape().dims2()?;
let cov = centered
.transpose(D::Minus1, D::Minus2)?
.matmul(&centered)?
.broadcast_div(&Tensor::new(m as f32, device)?)?;
Ok(cov)
}
fn pca(normalized_data: &Tensor, device: &Device, variance: f32) -> Result<Tensor> {
    let (_, n) = normalized_data.shape().dims2()?;
    let cov = cov(normalized_data, device)?;
    let vec: Vec<f32> = cov
        .to_device(&Device::Cpu)?
        .to_vec2()?
        .into_iter()
        .flatten()
        .collect();
    let dmatrix = DMatrix::from_vec(n, n, vec);
    let eig = SymmetricEigen::new(dmatrix);
    // SymmetricEigen does not guarantee any particular ordering, so sort the
    // eigenvalue/eigenvector pairs by decreasing eigenvalue before selecting components.
    let eigen_values = eig.eigenvalues.data.as_vec();
    let mut order: Vec<usize> = (0..n).collect();
    order.sort_by(|&a, &b| eigen_values[b].partial_cmp(&eigen_values[a]).unwrap());
    let total = eigen_values.iter().sum::<f32>();
    // Keep the smallest k whose leading eigenvalues explain the requested fraction of variance.
    let mut k = n;
    let mut explained = 0.0;
    for (i, &idx) in order.iter().enumerate() {
        explained += eigen_values[idx] / total;
        if explained >= variance {
            k = i + 1;
            println!("{} components explain {}% of the variance", k, explained * 100.0);
            break;
        }
    }
    // nalgebra stores matrices column-major, so each chunk of n values is one eigenvector.
    let eigen_vectors = eig.eigenvectors.data.as_vec();
    let mut selected = Vec::with_capacity(k * n);
    for &idx in order.iter().take(k) {
        selected.extend_from_slice(&eigen_vectors[idx * n..(idx + 1) * n]);
    }
    let eigen_vectors = Tensor::from_slice(selected.as_slice(), (k, n), device)?;
    Ok(eigen_vectors)
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Data CSV file from https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
    #[arg(long)]
    data_csv: String,
    /// Fraction of the variance the retained components should explain
    #[arg(long, default_value = "0.95")]
    variance: f32,
}
fn main() -> Result<()> {
let args = Args::parse();
let device = Device::cuda_if_available(0)?;
    let data = load_dataset(&args.data_csv, &device)?;
let normalized_data = z_score_normalize(&data)?;
let reduce = pca(&normalized_data, &device, args.variance)?;
    // Project the standardized data onto the retained principal components.
    let compressed_data = normalized_data.matmul(&reduce.transpose(D::Minus1, D::Minus2)?)?;
println!("Compressed data {:?}", compressed_data);
Ok(())
}

recommender-system/Cargo.toml Normal file

@@ -0,0 +1,13 @@
[package]
name = "recommender-system"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
csv = "1.1.6"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }

recommender-system/README.md Normal file

@@ -0,0 +1,3 @@
# Recommender System
A movie [recommender system](https://youtu.be/GIcuSNAAa4g?si=eiKFRfJXek15lO2_) built with the collaborative filtering learning algorithm on the [MovieLens-100K](https://www.kaggle.com/datasets/rajmehra03/movielens100k/code) dataset.
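
The example learns a feature matrix X (one row per movie) and a parameter matrix W (one row per user) by gradient descent on the regularized squared error over the observed ratings, roughly:

$$J(X, W) = \sum_{(i,j)\,:\,r(i,j)=1} \left(x^{(i)\top} w^{(j)} - y^{(i,j)}\right)^2 + \lambda \left(\lVert X \rVert_F^2 + \lVert W \rVert_F^2\right)$$

Recommendations are then made by finding the movies whose learned feature vectors are closest (in Euclidean distance) to a randomly chosen movie. To run it (assuming the Kaggle CSVs have been saved locally, e.g. as `ratings.csv` and `movies.csv`):

```bash
cargo run -- --ratings-csv ./ratings.csv --movies-csv ./movies.csv
```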

recommender-system/src/main.rs Normal file

@@ -0,0 +1,234 @@
extern crate csv;
use rand::seq::SliceRandom;
use rand::thread_rng;
use std::collections::HashSet;
use std::vec;
use std::{cmp::Ordering, collections::HashMap};
use anyhow::Result;
use candle_core::{Device, Tensor, D};
use clap::Parser;
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
struct Rating {
user: u32,
movie: u32,
rating_u32: u32,
}
impl Rating {
fn rating(&self) -> f32 {
self.rating_u32 as f32 / 10.0
}
}
// Order `Rating`s by user id, then by movie id.
impl PartialOrd for Rating {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Rating {
fn cmp(&self, other: &Self) -> Ordering {
// First compare by `userId`, then by `movieId`.
self.user
.cmp(&other.user)
.then_with(|| self.movie.cmp(&other.movie))
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
struct MovieDistance {
id: u32,
distance: u32,
}
// Order `MovieDistance`s by distance so the closest movies sort first.
impl PartialOrd for MovieDistance {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for MovieDistance {
fn cmp(&self, other: &Self) -> Ordering {
self.distance.cmp(&other.distance)
}
}
impl MovieDistance {
fn new(id: u32, distance: u32) -> Self {
Self { id, distance }
}
}
fn load_ratings(file_path: &str) -> Result<(HashSet<u32>, HashSet<u32>, HashSet<Rating>)> {
let mut rdr = csv::Reader::from_path(file_path)?;
let mut users: HashSet<u32> = HashSet::new();
let mut movies: HashSet<u32> = HashSet::new();
let mut ratings: HashSet<Rating> = HashSet::new();
for result in rdr.records() {
let record = result?;
let user: u32 = record[0].parse()?;
let movie: u32 = record[1].parse()?;
let rating: f32 = record[2].parse()?;
let rating_u32 = (rating * 10.0).round() as u32;
users.insert(user);
movies.insert(movie);
ratings.insert(Rating {
user,
movie,
rating_u32,
});
}
Ok((users, movies, ratings))
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Ratings CSV file (userId,movieId,rating,...) from https://www.kaggle.com/datasets/rajmehra03/movielens100k
    #[arg(long)]
    ratings_csv: String,
    /// Movies CSV file (movieId,title,...) from the same dataset
    #[arg(long)]
    movies_csv: String,
    /// Number of epochs to train
    #[arg(long, default_value = "250")]
    epochs: u32,
    /// Learning rate
    #[arg(long, default_value = "0.01")]
    lr: f32,
    /// Regularization factor
    #[arg(long, default_value = "0.01")]
    reg: f32,
    /// Number of latent features per movie/user
    #[arg(long, default_value = "100")]
    n_features: usize,
}
// Euclidean distance between two feature vectors.
fn cdist(x1: &Tensor, x2: &Tensor) -> Result<Tensor> {
    let diff = x1.sub(x2)?;
    let dist = diff.sqr()?.sum_all()?.sqrt()?;
    Ok(dist)
}
// Subtract each movie's mean observed rating (R masks which entries are rated).
fn mean_normalization(ratings: &Tensor, R: &Tensor) -> Result<Tensor> {
let sum = ratings.mul(&R)?.sum(1)?;
let count = R.sum(1)?;
let mean = sum.div(&count)?;
let adjusted = ratings.broadcast_sub(&mean.unsqueeze(1)?)?;
Ok(adjusted)
}
// Squared-error cost over the observed ratings only (R masks rated entries).
fn cost(X: &Tensor, W: &Tensor, Y: &Tensor, R: &Tensor) -> Result<f32> {
let c = X
.matmul(&W.t()?)?
.mul(&R)?
.sub(&Y.mul(&R)?)?
.sqr()?
.sum_all()?
.to_scalar::<f32>()?;
Ok(c)
}
fn main() -> Result<()> {
let args = Args::parse();
    let device = Device::cuda_if_available(0)?;
    let reg = Tensor::new(args.reg, &device)?;
    let lr = Tensor::new(args.lr, &device)?;
    let (users, movies, ratings) = load_ratings(&args.ratings_csv)?;
let mut users: Vec<u32> = users.into_iter().collect();
users.sort();
let mut movies: Vec<u32> = movies.into_iter().collect();
movies.sort();
let mut ratings: Vec<Rating> = ratings.into_iter().collect();
ratings.sort();
let n_users = users.len();
let n_movies = movies.len();
println!("n_users: {}, n_movies: {}", n_users, n_movies);
let mut Y = vec![vec![-1.0; n_users as usize]; n_movies as usize];
let mut R = vec![vec![0.0; n_users as usize]; n_movies as usize];
for rating in ratings.iter() {
let i = movies.iter().position(|&x| x == rating.movie).unwrap();
let j = users.iter().position(|&x| x == rating.user).unwrap();
Y[i][j] = rating.rating();
R[i][j] = 1.0;
}
let R = R.iter().flatten().copied().collect::<Vec<f32>>();
let R = Tensor::from_slice(&R, (n_movies, n_users), &device)?;
let Y = Y.iter().flatten().copied().collect::<Vec<f32>>();
let Y = Tensor::from_slice(&Y, (n_movies, n_users), &device)?;
let Y = mean_normalization(&Y, &R)?;
let mut X = Tensor::randn(0f32, 0.1, (n_movies, args.n_features), &device)?;
let mut W = Tensor::randn(0f32, 0.1, (n_users, args.n_features), &device)?;
    for epoch in 0..args.epochs {
        let diff = X.matmul(&W.t()?)?.mul(&R)?.sub(&Y.mul(&R)?)?;
        let grad_X = diff.matmul(&W)?.add(&X.broadcast_mul(&reg)?)?;
        let grad_W = diff.t()?.matmul(&X)?.add(&W.broadcast_mul(&reg)?)?;
        X = X.sub(&grad_X.broadcast_mul(&lr)?)?;
        W = W.sub(&grad_W.broadcast_mul(&lr)?)?;
        // Report the masked squared-error cost periodically.
        if epoch % 50 == 0 {
            println!("epoch: {}, cost: {}", epoch, cost(&X, &W, &Y, &R)?);
        }
    }
// Load movie titles
let mut rdr = csv::Reader::from_path(&args.movies_csv)?;
let mut movie_titles = HashMap::new();
for result in rdr.records() {
let record = result?;
let movie_id: u32 = record[0].parse()?;
let title = record[1].to_string();
movie_titles.insert(movie_id, title);
}
// Choose a random movie and find similar movies
let mut rng = thread_rng();
let random_movie_id = movies.choose(&mut rng).unwrap();
println!("Random movie: {}", movie_titles[random_movie_id]);
let random_movie_idx = movies.iter().position(|&x| x == *random_movie_id).unwrap();
let random_index_tensor = Tensor::from_slice(&[random_movie_idx as u32], &[1], &device)?;
let random_movie_features = X.index_select(&random_index_tensor, 0)?;
let mut movie_distances: Vec<MovieDistance> = Vec::new();
for i in 0..n_movies {
let movie_index_tensor = Tensor::from_slice(&[i as u32], &[1], &device)?;
let movie_features = X.index_select(&movie_index_tensor, 0)?;
let dist = cdist(&random_movie_features, &movie_features)?;
let dist = dist.to_scalar::<f32>()?;
let movie_distance = MovieDistance::new(movies[i], (dist * 1000.0) as u32);
movie_distances.push(movie_distance);
}
movie_distances.sort();
for i in 0..10 {
let movie_id = movie_distances[i].id;
let distance = movie_distances[i].distance;
println!(
"{}: {} (distance: {})",
i + 1,
movie_titles[&movie_id],
distance
);
}
Ok(())
}