mirror of https://github.com/vishpat/candle-coursera-ml.git (synced 2025-12-22 22:19:58 +00:00)

Initial commit
5 README.md Normal file
@@ -0,0 +1,5 @@
## Machine Learning with Rust using Candle

This repository features implementations of algorithms from the Stanford University [Machine Learning Course](https://www.youtube.com/@machinelearningandai3274), all crafted in Rust using the [Candle](https://github.com/huggingface/candle) crate. Each example leverages diverse datasets from Kaggle to demonstrate the algorithms' applications.

All of the examples are CUDA-enabled but can also run on a machine without a GPU, though they may be quite slow.

13 anamoly-detection/Cargo.toml Normal file
@@ -0,0 +1,13 @@
[package]
name = "anamoly-detection"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
csv = "1.1.6"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }

3 anamoly-detection/README.md Normal file
@@ -0,0 +1,3 @@
# Anomaly Detection

[Anomaly Detection](https://youtu.be/UqqPm-Q4aMo?si=TCZFJOJv94R1i71u) using a Gaussian distribution for the Kaggle [EECS 498 dataset](https://www.kaggle.com/c/eecs498/data).

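For reference, `anamoly-detection/src/main.rs` below fits each feature with an independent Gaussian (mean $\mu_j$ and variance $\sigma_j^2$ of the z-score-normalized data) and flags a sample $x$ as an anomaly when the joint density falls below the `--epsilon` threshold:

$$p(x) = \prod_{j=1}^{n} \frac{1}{\sqrt{2\pi}\,\sigma_j}\exp\!\left(-\frac{(x_j-\mu_j)^2}{2\sigma_j^2}\right), \qquad \text{anomaly} \iff p(x) < \varepsilon$$
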
100 anamoly-detection/src/main.rs Normal file
@@ -0,0 +1,100 @@
extern crate csv;
use std::vec;

use anyhow::Result;
use candle_core::{Device, Tensor};
use clap::Parser;

fn load_dataset(file_path: &str, device: &Device) -> Result<Tensor> {
    let mut rdr = csv::Reader::from_path(file_path)?;
    let mut data = Vec::new();
    for result in rdr.records() {
        let record = result?;
        let mut row = vec![];
        for i in 1..4 {
            row.push(record[i].parse::<f64>()?);
        }
        data.push(row);
    }
    let feature_cnt = data[0].len();
    let sample_cnt = data.len();
    let data = data.into_iter().flatten().collect::<Vec<_>>();
    let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?;
    Ok(data)
}

fn z_score_normalize(data: &Tensor) -> Result<Tensor> {
    let mean = data.mean(0)?;
    let squared_diff = data.broadcast_sub(&mean)?.sqr()?;
    let variance = squared_diff.mean(0)?;
    let std_dev = variance.sqrt()?;
    let normalized = data.broadcast_sub(&mean)?.broadcast_div(&std_dev)?;
    Ok(normalized)
}

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    // Data CSV file from https://www.kaggle.com/c/eecs498/data
    #[arg(long)]
    data_csv: String,

    #[arg(long, short, default_value = "false")]
    print: bool,

    #[arg(long, default_value = "0.001")]
    epsilon: f64,
}

// Joint density of independent per-feature Gaussians:
// p(x) = prod_j exp(-(x_j - mu_j)^2 / (2 * var_j)) / (sqrt(2 * pi) * sigma_j),
// computed as the reciprocal of exp((x_j - mu_j)^2 / (2 * var_j)) * sqrt(2 * pi) * sigma_j.
fn p_x(
    x: &Tensor,
    mean: &Tensor,
    two_variance: &Tensor,
    two_pi_sqrt_std_dev: &Tensor,
) -> Result<f64> {
    let px = x
        .broadcast_sub(mean)?
        .sqr()?
        .broadcast_div(two_variance)?
        .exp()?
        .broadcast_mul(two_pi_sqrt_std_dev)?
        .recip()?;
    let px = px.to_vec1::<f64>()?.into_iter().fold(1.0, |acc, x| acc * x);
    Ok(px)
}

fn main() -> Result<()> {
    let args = Args::parse();

    let device = Device::cuda_if_available(0)?;
    let data = load_dataset(&args.data_csv, &device)?;

    let data = z_score_normalize(&data)?;

    let mean = data.mean(0)?;
    let variance = data.broadcast_sub(&mean)?.sqr()?.mean(0)?;
    let std_dev = variance.sqrt()?;

    let two_variance = variance.broadcast_mul(&Tensor::new(2.0, &device)?)?;
    let two_pi_sqrt_std_dev =
        std_dev.broadcast_mul(&Tensor::new(2.0 * std::f64::consts::PI, &device)?.sqrt()?)?;

    let rows = data.shape().dims2()?.0;
    let mut anomalies = 0;
    for row in 0..rows {
        let row_tensor = data
            .index_select(&Tensor::new(&[row as u32], &device)?, 0)?
            .squeeze(0)?;
        let px = p_x(&row_tensor, &mean, &two_variance, &two_pi_sqrt_std_dev)?;
        if px < args.epsilon {
            anomalies += 1;
            if args.print {
                println!("Anomaly: {}", row + 1);
            }
        }
    }

    println!("Anomalies: {}, Total: {}", anomalies, rows);

    Ok(())
}

13 k-means/Cargo.toml Normal file
@@ -0,0 +1,13 @@
[package]
name = "k-means"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
csv = "1.1.6"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }

3 k-means/README.md Normal file
@@ -0,0 +1,3 @@
# K-Means clustering

[K-Means clustering](https://youtu.be/0D4LnsJr85Y?si=qmkMIqY39rMJUBDk) for the Kaggle [Iris dataset](https://www.kaggle.com/datasets/uciml/iris/data).

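For reference, each iteration in `k-means/src/main.rs` below alternates the two standard updates: assign every sample to its nearest center, then recompute each center as the mean of its assigned samples.

$$c^{(i)} := \arg\min_{k}\,\lVert x^{(i)} - \mu_k \rVert_2, \qquad \mu_k := \frac{1}{\lvert C_k \rvert}\sum_{i \in C_k} x^{(i)}$$
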
106 k-means/src/main.rs Normal file
@@ -0,0 +1,106 @@
extern crate csv;
use std::vec;

use anyhow::Result;
use candle_core::{DType, Device, Tensor, D};
use clap::Parser;
use rand::prelude::*;

// Pairwise Euclidean distances between the rows of x1 and the rows of x2.
fn cdist(x1: &Tensor, x2: &Tensor) -> Result<Tensor> {
    let x1 = x1.unsqueeze(0)?;
    let x2 = x2.unsqueeze(1)?;
    Ok(x1
        .broadcast_sub(&x2)?
        .sqr()?
        .sum(D::Minus1)?
        .sqrt()?
        .transpose(D::Minus1, D::Minus2)?)
}

fn load_dataset(file_path: &str, device: &Device) -> Result<Tensor> {
    let mut rdr = csv::Reader::from_path(file_path)?;
    let mut data = Vec::new();
    for result in rdr.records() {
        let record = result?;
        let mut row = vec![];
        for i in 1..5 {
            row.push(record[i].parse::<f64>()?);
        }
        data.push(row);
    }
    let feature_cnt = data[0].len();
    let sample_cnt = data.len();
    let data = data.into_iter().flatten().collect::<Vec<_>>();
    let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?;
    Ok(data)
}

fn k_means(data: &Tensor, k: usize, max_iter: i64, device: &Device) -> Result<(Tensor, Tensor)> {
    let (n, _) = data.dims2()?;
    let mut rng = rand::thread_rng();
    let mut indices = (0..n).collect::<Vec<_>>();
    indices.shuffle(&mut rng);

    // Initialize the centers with k randomly chosen samples.
    let centroid_idx = indices[..k]
        .iter()
        .copied()
        .map(|x| x as i64)
        .collect::<Vec<_>>();

    let centroid_idx_tensor = Tensor::from_slice(centroid_idx.as_slice(), (k,), device)?;
    let mut centers = data.index_select(&centroid_idx_tensor, 0)?;
    let mut cluster_assignments = Tensor::zeros((n,), DType::U32, device)?;
    for _ in 0..max_iter {
        // Assignment step: each sample goes to its nearest center.
        let dist = cdist(data, &centers)?;
        cluster_assignments = dist.argmin(D::Minus1)?;
        // Update step: each center becomes the mean of its assigned samples.
        let mut centers_vec = vec![];
        for i in 0..k {
            let mut indices = vec![];
            cluster_assignments
                .to_vec1::<u32>()?
                .iter()
                .enumerate()
                .for_each(|(j, x)| {
                    if *x == i as u32 {
                        indices.push(j as u32);
                    }
                });
            let indices = Tensor::from_slice(indices.as_slice(), (indices.len(),), device)?;
            let cluster_data = data.index_select(&indices, 0)?;
            let mean = cluster_data.mean(0)?;
            centers_vec.push(mean);
        }
        centers = Tensor::stack(centers_vec.as_slice(), 0)?;
    }
    Ok((centers, cluster_assignments))
}

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    // Data CSV file from https://www.kaggle.com/datasets/uciml/iris/data
    #[arg(long)]
    data_csv: String,

    // Number of clusters
    #[arg(long, default_value = "3")]
    k: usize,

    // Maximum number of iterations
    #[arg(long, default_value = "100")]
    max_iter: i64,
}
fn main() -> Result<()> {
    let args = Args::parse();
    let device = Device::cuda_if_available(0)?;
    let data = load_dataset(&args.data_csv, &device).unwrap();
    let (centers, cluster_assignments) = k_means(&data, args.k, args.max_iter, &device)?;
    println!("{}", centers);
    println!("{}", cluster_assignments);
    let cluster_sizes = cluster_assignments.to_vec1::<u32>()?;
    for i in 0..args.k {
        let size = cluster_sizes.iter().filter(|&&x| x == i as u32).count();
        println!("Cluster {} size: {}", i, size);
    }
    Ok(())
}

14 linear-regression/Cargo.toml Normal file
@@ -0,0 +1,14 @@
[package]
name = "linear-regression"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
csv = "1.1.6"
ndarray = "0.15.3"
candle = { git = "https://github.com/huggingface/candle", package = "candle-core", features = ["cuda"]}
anyhow = "1.0.40"
clap = { version = "4.5.1", features = ["derive"] }
rand = "0.8.5"

7 linear-regression/README.md Normal file
@@ -0,0 +1,7 @@
# Linear regression

A [Linear regression](https://youtu.be/W46UTQ_JDPk?si=dfz9_kFBUkM3E1RR) model (with regularization) trained using gradient descent on the Kaggle [insurance dataset](https://www.kaggle.com/code/kianwee/linear-regression-insurance-dataset).

```bash
cargo run -- --data-csv ./insurance.csv
```

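In the notation of the course, the mini-batch step implemented by `LinearRegression::train` in `linear-regression/src/main.rs` below is (with $m$ the batch size, $\alpha$ the learning rate, and $\lambda$ the regularization parameter):

$$\hat{y} = Xw + b, \qquad w := w - \alpha\left(\tfrac{1}{m}X^{\top}(\hat{y}-y) + \tfrac{\lambda}{m}\,w\right), \qquad b := b - \alpha\,\tfrac{1}{m}\sum_{i=1}^{m}(\hat{y}_i - y_i)$$
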
245 linear-regression/src/main.rs Normal file
@@ -0,0 +1,245 @@
extern crate csv;
use anyhow::Result;
use candle::{Device, Tensor, D};
use clap::Parser;
use core::panic;
use rand::prelude::*;
use std::fs::File;
use std::rc::Rc;

struct Dataset {
    pub training_data: Tensor,
    pub training_labels: Tensor,
    pub test_data: Tensor,
    pub test_labels: Tensor,
    pub feature_cnt: usize,
}

// Implement Linear Regression model using Gradient Descent
// https://www.youtube.com/watch?v=UVCFaaEBnTE
struct LinearRegression {
    weights: Tensor,
    bias: Tensor,
    device: Rc<Device>,
}

impl LinearRegression {
    fn new(feature_cnt: usize, device: Rc<Device>) -> Result<Self> {
        let weights: Vec<f32> = vec![0.0; feature_cnt];
        let weights = Tensor::from_vec(weights, (feature_cnt,), &device)?;
        let bias = Tensor::new(0.0f32, &device)?;
        Ok(Self {
            weights,
            bias,
            device,
        })
    }

    // Prediction: y_hat = X * w + b
    fn hypothesis(&self, x: &Tensor) -> Result<Tensor> {
        Ok(x.matmul(&self.weights.unsqueeze(1)?)?
            .squeeze(1)?
            .broadcast_add(&self.bias)?)
    }

    // Mean squared error between predictions and labels.
    fn loss(&self, y1: &Tensor, y2: &Tensor) -> Result<f32> {
        let diff = y1.sub(y2)?;
        let loss = diff.mul(&diff)?.mean(D::Minus1)?.to_scalar::<f32>()?;
        Ok(loss)
    }

    // One gradient-descent step on a mini-batch:
    // w := w - alpha * (X^T (y_hat - y) / m + (lambda / m) * w)
    // b := b - alpha * mean(y_hat - y)
    fn train(
        &mut self,
        x: &Tensor,
        y: &Tensor,
        learning_rate: f32,
        regularization: f32,
    ) -> Result<()> {
        let m = y.shape().dims1()?;
        let predictions = self.hypothesis(x)?;
        let deltas = predictions.sub(y)?;
        let regularization = self
            .weights
            .broadcast_mul(&Tensor::new(regularization / m as f32, &self.device)?)?;

        let gradient = x
            .t()?
            .matmul(&deltas.unsqueeze(D::Minus1)?)?
            .broadcast_div(&Tensor::new(m as f32, &self.device)?)?;
        let gradient = gradient
            .squeeze(D::Minus1)?
            .squeeze(D::Minus1)?
            .add(&regularization)?;
        self.weights = self
            .weights
            .sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?;
        let gradient = deltas.mean(D::Minus1)?;
        self.bias = self
            .bias
            .sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?;
        Ok(())
    }
}

fn r2_score(predictions: &Tensor, labels: &Tensor) -> Result<f32, Box<dyn std::error::Error>> {
    let mean = labels.mean(D::Minus1)?;

    let ssr = labels.sub(predictions)?;
    let ssr = ssr.mul(&ssr)?.sum(D::Minus1)?;

    let sst = labels.broadcast_sub(&mean)?;
    let sst = sst.mul(&sst)?.sum(D::Minus1)?;

    let tmp = ssr.div(&sst)?.to_scalar::<f32>()?;

    Ok(1.0 - tmp)
}

const BATCH_SIZE: usize = 100;

fn insurance_dataset(file_path: &str, device: &Device) -> Result<Dataset> {
    // https://www.kaggle.com/mirichoi0218/insurance

    let file = File::open(file_path)?;
    let mut rdr = csv::Reader::from_reader(file);
    let mut data: Vec<Vec<f32>> = vec![];
    let mut labels: Vec<f32> = vec![];

    const FEATURE_CNT: usize = 6;
    const MALE: f32 = 0.5;
    const FEMALE: f32 = -0.5;

    const YES: f32 = 0.5;
    const NO: f32 = -0.5;

    const NORTHWEST: f32 = 0.25;
    const NORTHEAST: f32 = -0.25;
    const SOUTHWEST: f32 = 0.5;
    const SOUTHEAST: f32 = -0.5;

    for result in rdr.records() {
        let record = result?;
        let age: f32 = (record[0].parse::<u32>()? as f32) / 100.0;
        let gender = match record[1].parse::<String>()?.as_str() {
            "male" => MALE,
            "female" => FEMALE,
            _ => panic!("Invalid Gender"),
        };
        let bmi: f32 = record[2].parse::<f32>()? / 100.0;
        let children: f32 = record[3].parse()?;
        let smoker = match record[4].parse::<String>()?.as_str() {
            "yes" => YES,
            "no" => NO,
            _ => panic!("Invalid Smoker"),
        };
        let region = match record[5].parse::<String>()?.as_str() {
            "northwest" => NORTHWEST,
            "northeast" => NORTHEAST,
            "southwest" => SOUTHWEST,
            "southeast" => SOUTHEAST,
            _ => panic!("Invalid Region"),
        };
        let charges: f32 = record[6].parse()?;

        let row = vec![age, gender, bmi, children, smoker, region];
        data.push(row);

        let label = charges;
        labels.push(label);
    }
    let training_size = labels.len() * 8 / 10;
    let training_data = data[..training_size].to_vec();
    let training_labels = labels[..training_size].to_vec();

    let training_data = training_data
        .iter()
        .flatten()
        .copied()
        .collect::<Vec<f32>>();
    let training_data_tensor =
        Tensor::from_slice(&training_data, (training_labels.len(), FEATURE_CNT), device)?;
    let training_labels_tensor =
        Tensor::from_slice(&training_labels, (training_labels.len(),), device)?;

    let test_data = data[training_size..].to_vec();
    let test_labels = labels[training_size..].to_vec();

    let test_data = test_data.iter().flatten().copied().collect::<Vec<f32>>();
    let test_data_tensor =
        Tensor::from_slice(&test_data, (test_labels.len(), FEATURE_CNT), device)?;
    let test_labels_tensor = Tensor::from_slice(&test_labels, (test_labels.len(),), device)?;

    Ok(Dataset {
        training_data: training_data_tensor,
        training_labels: training_labels_tensor,
        test_data: test_data_tensor,
        test_labels: test_labels_tensor,
        feature_cnt: FEATURE_CNT,
    })
}

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    #[arg(long)]
    data_csv: String,

    // Print the Cost and Loss at each epoch
    #[arg(long, default_value_t = false)]
    progress: bool,

    // The learning rate
    #[arg(long, default_value = "0.01")]
    learning_rate: f32,

    // The regularization parameter
    #[arg(long, default_value = "0.01")]
    regularization: f32,

    // The number of epochs
    #[arg(long, default_value = "10000")]
    epochs: i32,
}
fn main() -> Result<()> {
    let args = Args::parse();
    let file_path = args.data_csv;

    let device = Rc::new(Device::cuda_if_available(0)?);
    let dataset = insurance_dataset(&file_path, &device)?;

    let mut model = LinearRegression::new(dataset.feature_cnt, device)?;
    let (training_size, _) = dataset.training_data.shape().dims2()?;
    let n_batches = training_size / BATCH_SIZE;
    let mut batch_idxs = (0..n_batches).collect::<Vec<usize>>();

    for epoch in 0..args.epochs {
        let mut sum_loss = 0.0;
        batch_idxs.shuffle(&mut rand::thread_rng());
        for batch_idx in batch_idxs.iter() {
            let train_data = dataset
                .training_data
                .narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?;
            let train_labels = dataset
                .training_labels
                .narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?;
            model.train(
                &train_data,
                &train_labels,
                args.learning_rate,
                args.regularization,
            )?;
            let predictions = model.hypothesis(&train_data)?;
            let loss = model.loss(&predictions, &train_labels)?;
            sum_loss += loss;
        }
        if args.progress && epoch % 1000 == 0 {
            let predictions = model.hypothesis(&dataset.test_data)?;
            let r2 = r2_score(&predictions, &dataset.test_labels).unwrap();
            println!(
                "epoch: {epoch}, loss: {}, accuracy: {}",
                sum_loss / n_batches as f32,
                r2
            );
        }
    }

    Ok(())
}

15 logistic-regression/Cargo.toml Normal file
@@ -0,0 +1,15 @@
[package]
name = "logistic-regression"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
csv = "1.1.6"
ndarray = "0.15.3"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }
candle-datasets = { git = "https://github.com/huggingface/candle.git", version = "0.4.1" }

7 logistic-regression/README.md Normal file
@@ -0,0 +1,7 @@
# Logistic Regression

A [Logistic regression](https://youtu.be/4u81xU7BIOc?si=7ZSLIqS-bzrBZWgL) model (with regularization) trained using gradient descent on the MNIST [dataset](https://www.kaggle.com/datasets/hojjatk/mnist-dataset) to distinguish zero from non-zero digits.

```bash
cargo run
```

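The model below uses the same regularized gradient step as the linear-regression example, but with a sigmoid hypothesis:

$$\hat{y} = \sigma(Xw + b), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}, \qquad w := w - \alpha\left(\tfrac{1}{m}X^{\top}(\hat{y}-y) + \tfrac{\lambda}{m}\,w\right), \qquad b := b - \alpha\,\tfrac{1}{m}\sum_{i}(\hat{y}_i - y_i)$$
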
177 logistic-regression/src/main.rs Normal file
@@ -0,0 +1,177 @@
extern crate csv;
use anyhow::Result;
use candle_core::{Device, Tensor, D};
use clap::Parser;
use rand::prelude::*;
use std::rc::Rc;

// Implement Logistic Regression model using Gradient Descent
// https://www.youtube.com/watch?v=4u81xU7BIOc
struct LogisticRegression {
    weights: Tensor,
    bias: Tensor,
    device: Rc<Device>,
}

// Element-wise logistic function: sigmoid(x) = 1 / (1 + e^(-x)).
fn sigmoid(xs: &Tensor) -> Result<Tensor> {
    Ok((xs.neg()?.exp()? + 1.0)?.recip()?)
}

impl LogisticRegression {
    fn new(feature_cnt: usize, device: Rc<Device>) -> Result<Self> {
        let weights: Vec<f32> = vec![0.0; feature_cnt];
        let weights = Tensor::from_vec(weights, (feature_cnt,), &device)?;
        let bias = Tensor::new(0.0f32, &device)?;
        Ok(Self {
            weights,
            bias,
            device,
        })
    }

    // Prediction: y_hat = sigmoid(X * w + b)
    fn hypothesis(&self, x: &Tensor) -> Result<Tensor> {
        Ok(sigmoid(
            &x.matmul(&self.weights.unsqueeze(1)?)?
                .squeeze(1)?
                .broadcast_add(&self.bias)?,
        )?)
    }

    fn loss(&self, y1: &Tensor, y2: &Tensor) -> Result<f32> {
        let diff = y1.sub(y2)?;
        let loss = diff.mul(&diff)?.mean(D::Minus1)?.to_scalar::<f32>()?;
        Ok(loss)
    }

    // One gradient-descent step on a mini-batch, with L2 regularization on the weights.
    fn train(
        &mut self,
        x: &Tensor,
        y: &Tensor,
        learning_rate: f32,
        regularization: f32,
    ) -> Result<()> {
        let m = y.shape().dims1()?;
        let predictions = self.hypothesis(x)?;
        let deltas = predictions.sub(y)?;
        let regularization = self
            .weights
            .broadcast_mul(&Tensor::new(regularization / m as f32, &self.device)?)?;

        let gradient = x
            .t()?
            .matmul(&deltas.unsqueeze(D::Minus1)?)?
            .broadcast_div(&Tensor::new(m as f32, &self.device)?)?;
        let gradient = gradient
            .squeeze(D::Minus1)?
            .squeeze(D::Minus1)?
            .add(&regularization)?;

        self.weights = self
            .weights
            .sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?;
        let gradient = deltas.mean(D::Minus1)?;
        self.bias = self
            .bias
            .sub(&gradient.broadcast_mul(&Tensor::new(learning_rate, &self.device)?)?)?;
        Ok(())
    }
}

const BATCH_SIZE: usize = 100;

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    // Print the Cost and Loss at each epoch
    #[arg(long, default_value_t = false)]
    progress: bool,
    // The learning rate
    #[arg(long, default_value = "0.01")]
    learning_rate: f32,

    // Regularization parameter
    #[arg(long, default_value = "0.1")]
    regularization: f32,

    // The number of epochs
    #[arg(long, default_value = "10000")]
    epochs: i32,

    // The digit to classify
    #[arg(long, default_value = "0")]
    digit: u8,
}

fn main() -> Result<()> {
    let args = Args::parse();
    let device = Rc::new(Device::cuda_if_available(0)?);

    let dataset = candle_datasets::vision::mnist::load()?;
    let (_, n) = dataset.train_images.shape().dims2()?;
    let training_images = dataset.train_images.to_device(&device)?;
    let training_labels = dataset.train_labels.to_device(&device)?;
    // Binary labels: 1.0 for the target digit, 0.0 for everything else.
    let training_labels_vec = training_labels
        .to_vec1::<u8>()?
        .into_iter()
        .map(|x| if x == args.digit { 1.0 } else { 0.0 })
        .collect::<Vec<f32>>();
    let len = training_labels_vec.len();
    let training_labels = Tensor::from_vec(training_labels_vec, (len,), &device)?;

    let test_images = dataset.test_images.to_device(&device)?;
    let test_labels = dataset.test_labels.to_device(&device)?;
    let test_labels_vec = test_labels
        .to_vec1::<u8>()?
        .into_iter()
        .map(|x| if x == args.digit { 1f32 } else { 0f32 })
        .collect::<Vec<f32>>();
    let len = test_labels_vec.len();
    let test_labels = Tensor::from_vec(test_labels_vec, (len,), &device)?;

    let mut model = LogisticRegression::new(n, device.clone())?;
    let (training_size, _) = training_images.shape().dims2()?;
    let n_batches = training_size / BATCH_SIZE;
    let mut batch_idxs = (0..n_batches).collect::<Vec<usize>>();

    for epoch in 0..args.epochs {
        let mut sum_loss = 0.0;
        batch_idxs.shuffle(&mut rand::thread_rng());
        for batch_idx in batch_idxs.iter() {
            let train_data = training_images.narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?;
            let train_labels = training_labels.narrow(0, batch_idx * BATCH_SIZE, BATCH_SIZE)?;
            model.train(
                &train_data,
                &train_labels,
                args.learning_rate,
                args.regularization,
            )?;
            let predictions = model.hypothesis(&train_data)?;
            let loss = model.loss(&predictions, &train_labels)?;
            sum_loss += loss;
        }
        if args.progress && epoch % 1000 == 0 {
            let predictions = model.hypothesis(&test_images)?;
            let predictions_vec = predictions
                .to_vec1::<f32>()?
                .into_iter()
                .map(|x| if x > 0.5 { 1f32 } else { 0f32 })
                .collect::<Vec<f32>>();
            let predictions = Tensor::from_vec(predictions_vec, (len,), &device)?;

            let accuracy = predictions
                .eq(&test_labels)?
                .to_vec1::<u8>()?
                .into_iter()
                .map(f32::from)
                .sum::<f32>()
                / len as f32;
            println!(
                "epoch: {epoch}, loss: {}, Test Accuracy: {}",
                sum_loss / n_batches as f32,
                accuracy
            );
        }
    }

    Ok(())
}

14 neural-networks/Cargo.toml Normal file
@@ -0,0 +1,14 @@
[package]
name = "neural-networks"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.40"
csv = "1.1.6"
clap = { version = "4.5.1", features = ["derive"] }
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"]}
candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.4.1" }

3 neural-networks/README.md Normal file
@@ -0,0 +1,3 @@
## Multi-Classifier using Neural Networks

A simple [Neural Network](https://youtu.be/UVjj2fHu9YU?si=R8-wuF1QAYDK_SGy) with a single hidden layer that classifies images from the [MNIST Fashion dataset](https://www.kaggle.com/datasets/zalando-research/fashionmnist).

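For reference, the `Mlp` in `neural-networks/src/main.rs` below maps a flattened 28×28 image through a 100-unit ReLU hidden layer to 10 logits and is trained with SGD on the negative log-likelihood of the log-softmax:

$$h = \mathrm{ReLU}(W_1 x + b_1), \qquad z = W_2 h + b_2, \qquad \mathcal{L} = -\log\,\mathrm{softmax}(z)_y$$
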
135 neural-networks/src/main.rs Normal file
@@ -0,0 +1,135 @@
use anyhow::Result;
use candle_core::{DType, Device, Tensor, D};
use candle_nn::{loss, ops, Linear, Module, Optimizer, VarBuilder, VarMap};
use clap::Parser;
use std::rc::Rc;

const IMAGE_DIM: usize = 28 * 28;
const LABELS: usize = 10;

struct Dataset {
    pub training_data: Tensor,
    pub training_labels: Tensor,
    pub test_data: Tensor,
    pub test_labels: Tensor,
}

fn load_tensors(csv: &str, device: &Device) -> Result<(Tensor, Tensor)> {
    let mut data = Vec::new();
    let mut labels = Vec::new();

    let mut rdr = csv::Reader::from_path(csv)?;
    for result in rdr.records() {
        let record = result?;
        let label = record.get(0).unwrap().parse::<u32>()?;
        let mut features = Vec::new();
        for i in 1..record.len() {
            features.push(record.get(i).unwrap().parse::<f32>()? / 255.0);
        }
        labels.push(label);
        data.push(features);
    }

    let data = data.into_iter().flatten().collect::<Vec<f32>>();
    let data = Tensor::from_slice(&data, (labels.len(), IMAGE_DIM), device)?;
    let labels = Tensor::from_slice(&labels, (labels.len(),), device)?;

    Ok((data, labels))
}

fn load_dataset(train_csv: &str, test_csv: &str, device: &Device) -> Result<Dataset> {
    let (training_data, training_labels) = load_tensors(train_csv, device)?;
    let (test_data, test_labels) = load_tensors(test_csv, device)?;

    Ok(Dataset {
        training_data,
        training_labels,
        test_data,
        test_labels,
    })
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    #[arg(long)]
    train_csv: String,

    #[arg(long)]
    test_csv: String,

    // Print the Cost and Loss at each epoch
    #[arg(long, default_value_t = false)]
    progress: bool,

    // The learning rate
    #[arg(long, default_value = "0.01")]
    learning_rate: f64,

    // The regularization parameter
    #[arg(long, default_value = "0.01")]
    regularization: f32,

    // The number of epochs
    #[arg(long, default_value = "5000")]
    epochs: i32,
}

struct Mlp {
    ln1: Linear,
    ln2: Linear,
}

impl Mlp {
    fn new(vs: VarBuilder) -> Result<Self> {
        let ln1 = candle_nn::linear(IMAGE_DIM, 100, vs.pp("ln1"))?;
        let ln2 = candle_nn::linear(100, LABELS, vs.pp("ln2"))?;
        Ok(Self { ln1, ln2 })
    }

    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let xs = self.ln1.forward(xs)?;
        let xs = xs.relu()?;
        Ok(self.ln2.forward(&xs)?)
    }
}

fn main() -> Result<()> {
    let args = Args::parse();
    let device = Rc::new(Device::cuda_if_available(0)?);
    let dataset = load_dataset(&args.train_csv, &args.test_csv, &device)?;

    let varmap = VarMap::new();
    let vs = VarBuilder::from_varmap(&varmap, DType::F32, &device);
    let model = Mlp::new(vs)?;
    let mut sgd = candle_nn::SGD::new(varmap.all_vars(), args.learning_rate)?;

    let test_images = dataset.test_data.to_device(&device)?;
    let test_labels = dataset
        .test_labels
        .to_dtype(DType::U32)?
        .to_device(&device)?;
    for epoch in 1..args.epochs {
        let logits = model.forward(&dataset.training_data)?;
        let log_sm = ops::log_softmax(&logits, D::Minus1)?;
        let loss = loss::nll(&log_sm, &dataset.training_labels)?;
        sgd.backward_step(&loss)?;

        let test_logits = model.forward(&test_images)?;
        let sum_ok = test_logits
            .argmax(D::Minus1)?
            .eq(&test_labels)?
            .to_dtype(DType::F32)?
            .sum_all()?
            .to_scalar::<f32>()?;
        let test_accuracy = sum_ok / test_labels.dims1()? as f32;
        if args.progress && epoch % 100 == 0 {
            println!(
                "{epoch:4} train loss: {:8.5} test acc: {:5.2}%",
                loss.to_scalar::<f32>()?,
                100. * test_accuracy
            );
        }
    }

    Ok(())
}

13 pca/Cargo.toml Normal file
@@ -0,0 +1,13 @@
[package]
name = "pca"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
nalgebra = "0.32.4"
csv = "1.1.6"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }

3 pca/README.md Normal file
@@ -0,0 +1,3 @@
# Principal Component Analysis

Implementation of the [PCA algorithm](https://youtu.be/pAwjiGkafbM?si=BBsViJAkIGD89_Ub) to reduce the dimensionality of the [Breast Cancer Wisconsin (Diagnostic) dataset](https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data).

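For reference, `pca/src/main.rs` below builds the covariance of the z-score-normalized data, eigendecomposes it with nalgebra, keeps the smallest $k$ whose cumulative eigenvalue ratio exceeds the `--variance` threshold, and projects the data onto those $k$ eigenvectors:

$$\Sigma = \frac{1}{m} X_c^{\top} X_c, \qquad \Sigma v_i = \lambda_i v_i, \qquad \frac{\sum_{i=1}^{k}\lambda_i}{\sum_{i=1}^{n}\lambda_i} > \text{variance}, \qquad Z = X V_k^{\top}$$
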
100 pca/src/main.rs Normal file
@@ -0,0 +1,100 @@
use anyhow::{Ok, Result};
use candle_core::{Device, Tensor, D};
use clap::Parser;
use nalgebra::linalg::SymmetricEigen;
use nalgebra::DMatrix;

fn load_dataset(file_path: &str, device: &Device) -> Result<Tensor> {
    let mut rdr = csv::Reader::from_path(file_path)?;
    let mut data = Vec::new();
    for result in rdr.records() {
        let record = result?;
        let mut row = Vec::new();
        for i in 2..32 {
            let value = record[i].parse::<f32>()?;
            row.push(value);
        }
        data.push(row);
    }
    let feature_cnt = data[0].len();
    let sample_cnt = data.len();
    let data = data.into_iter().flatten().collect::<Vec<_>>();
    let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?;
    Ok(data)
}

fn z_score_normalize(data: &Tensor) -> Result<Tensor> {
    let mean = data.mean(0)?;
    let squared_diff = data.broadcast_sub(&mean)?.sqr()?;
    let variance = squared_diff.mean(0)?;
    let std_dev = variance.sqrt()?;
    let normalized = data.broadcast_sub(&mean)?.broadcast_div(&std_dev)?;
    Ok(normalized)
}

// Covariance matrix of the samples: (X_c^T * X_c) / m, with X_c the mean-centered data.
fn cov(data: &Tensor, device: &Device) -> Result<Tensor> {
    let mean = data.mean(0)?;
    let centered = data.broadcast_sub(&mean)?;
    let (m, _) = data.shape().dims2()?;
    let cov = centered
        .transpose(D::Minus1, D::Minus2)?
        .matmul(&centered)?
        .broadcast_div(&Tensor::new(m as f32, device)?)?;

    Ok(cov)
}

fn pca(normalized_data: &Tensor, device: &Device, variance: f32) -> Result<Tensor> {
    let (_, n) = normalized_data.shape().dims2()?;
    let cov = cov(normalized_data, device)?;
    let vec: Vec<f32> = cov
        .to_device(&Device::Cpu)?
        .to_vec2()?
        .into_iter()
        .flatten()
        .collect();
    let dmatrix = DMatrix::from_vec(n, n, vec);
    let eig = SymmetricEigen::new(dmatrix);
    let eigen_values = eig.eigenvalues.data.as_vec();
    let total = eigen_values.iter().sum::<f32>();
    // Keep the smallest k whose cumulative eigenvalue ratio exceeds the requested variance.
    let mut k = 0;
    for i in 0..n {
        let var = eigen_values[0..i].iter().sum::<f32>() / total;
        if var > variance {
            println!("{} components explain {}% of the variance", i, var * 100.0);
            k = i;
            break;
        }
    }

    let eigen_vectors = eig.eigenvectors.data.as_vec();
    let eigen_vectors = eigen_vectors
        .chunks(n)
        .take(k)
        .flatten()
        .copied()
        .collect::<Vec<_>>();
    let eigen_vectors = Tensor::from_slice(eigen_vectors.as_slice(), (k, n), device)?;
    Ok(eigen_vectors)
}

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    // Data CSV file from https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
    #[arg(long)]
    data_csv: String,

    #[arg(long, default_value = "0.95")]
    variance: f32,
}
fn main() -> Result<()> {
    let args = Args::parse();
    let device = Device::cuda_if_available(0)?;
    let data = load_dataset(&args.data_csv, &device).unwrap();
    let normalized_data = z_score_normalize(&data)?;
    let reduce = pca(&normalized_data, &device, args.variance)?;
    let compressed_data = data.matmul(&reduce.transpose(D::Minus1, D::Minus2)?)?;
    println!("Compressed data {:?}", compressed_data);
    Ok(())
}

13 recommender-system/Cargo.toml Normal file
@@ -0,0 +1,13 @@
[package]
name = "recommender-system"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
csv = "1.1.6"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }

3 recommender-system/README.md Normal file
@@ -0,0 +1,3 @@
# Recommender System

Build a movie [recommender system](https://youtu.be/GIcuSNAAa4g?si=eiKFRfJXek15lO2_) using the collaborative filtering learning algorithm on the [MovieLens-100K](https://www.kaggle.com/datasets/rajmehra03/movielens100k/code) dataset.

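For reference, the training loop in `recommender-system/src/main.rs` below learns movie features $X$ and user features $W$ by gradient descent on the squared error over observed ratings (the indicator $R$ marks rated entries of the mean-normalized matrix $Y$), with a $\lambda$-weighted regularization term added to each gradient:

$$J(X, W) = \sum_{(i,j)\,:\,r_{ij}=1}\bigl(x^{(i)\top} w^{(j)} - y_{ij}\bigr)^2, \qquad \nabla_X = \bigl((XW^{\top}-Y)\odot R\bigr)W + \lambda X, \qquad \nabla_W = \bigl((XW^{\top}-Y)\odot R\bigr)^{\top}X + \lambda W$$
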
234 recommender-system/src/main.rs Normal file
@@ -0,0 +1,234 @@
extern crate csv;
use rand::seq::SliceRandom;
use rand::thread_rng;
use std::collections::HashSet;
use std::vec;
use std::{cmp::Ordering, collections::HashMap};

use anyhow::Result;
use candle_core::{Device, Tensor, D};
use clap::Parser;

#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
struct Rating {
    user: u32,
    movie: u32,
    rating_u32: u32,
}

impl Rating {
    fn rating(&self) -> f32 {
        self.rating_u32 as f32 / 10.0
    }
}

// Step 2: Implement `PartialOrd` and `Ord` for the struct.
impl PartialOrd for Rating {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Rating {
    fn cmp(&self, other: &Self) -> Ordering {
        // First compare by `userId`, then by `movieId`.
        self.user
            .cmp(&other.user)
            .then_with(|| self.movie.cmp(&other.movie))
    }
}

#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
struct MovieDistance {
    id: u32,
    distance: u32,
}

// Step 2: Implement `PartialOrd` and `Ord` for the struct.
impl PartialOrd for MovieDistance {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for MovieDistance {
    fn cmp(&self, other: &Self) -> Ordering {
        self.distance.cmp(&other.distance)
    }
}

impl MovieDistance {
    fn new(id: u32, distance: u32) -> Self {
        Self { id, distance }
    }
}

fn load_ratings(file_path: &str) -> Result<(HashSet<u32>, HashSet<u32>, HashSet<Rating>)> {
    let mut rdr = csv::Reader::from_path(file_path)?;
    let mut users: HashSet<u32> = HashSet::new();
    let mut movies: HashSet<u32> = HashSet::new();
    let mut ratings: HashSet<Rating> = HashSet::new();

    for result in rdr.records() {
        let record = result?;
        let user: u32 = record[0].parse()?;
        let movie: u32 = record[1].parse()?;
        let rating: f32 = record[2].parse()?;
        let rating_u32 = (rating * 10.0).round() as u32;
        users.insert(user);
        movies.insert(movie);
        ratings.insert(Rating {
            user,
            movie,
            rating_u32,
        });
    }

    Ok((users, movies, ratings))
}

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    // Ratings CSV file from https://www.kaggle.com/datasets/rajmehra03/movielens100k
    #[arg(long)]
    ratings_csv: String,

    #[arg(long)]
    movies_csv: String,

    // Number of epochs to train
    #[arg(long, default_value = "250")]
    epochs: u32,

    // Learning rate
    #[arg(long, default_value = "0.01")]
    lr: f32,

    // Regularization factor
    #[arg(long, default_value = "0.01")]
    reg: f32,

    // Number of features
    #[arg(long, default_value = "100")]
    n_features: usize,
}

// Euclidean distance between two feature vectors.
fn cdist(x1: &Tensor, x2: &Tensor) -> Result<Tensor> {
    let diff = x1.sub(&x2)?;
    let dist = diff.sqr()?.sum_all()?.sqrt()?;
    Ok(dist)
}

// Subtract each movie's mean observed rating (R marks the observed entries).
fn mean_normalization(ratings: &Tensor, R: &Tensor) -> Result<Tensor> {
    let sum = ratings.mul(&R)?.sum(1)?;
    let count = R.sum(1)?;
    let mean = sum.div(&count)?;
    let adjusted = ratings.broadcast_sub(&mean.unsqueeze(1)?)?;
    Ok(adjusted)
}

// Squared error over the observed ratings only.
fn cost(X: &Tensor, W: &Tensor, Y: &Tensor, R: &Tensor) -> Result<f32> {
    let c = X
        .matmul(&W.t()?)?
        .mul(&R)?
        .sub(&Y.mul(&R)?)?
        .sqr()?
        .sum_all()?
        .to_scalar::<f32>()?;
    Ok(c)
}

fn main() -> Result<()> {
    let args = Args::parse();
    let reg = Tensor::new(args.reg, &Device::cuda_if_available(0)?)?;
    let lr = Tensor::new(args.lr, &Device::cuda_if_available(0)?)?;

    let device = Device::cuda_if_available(0)?;

    let (users, movies, ratings) = load_ratings(&args.ratings_csv).unwrap();
    let mut users: Vec<u32> = users.into_iter().collect();
    users.sort();

    let mut movies: Vec<u32> = movies.into_iter().collect();
    movies.sort();

    let mut ratings: Vec<Rating> = ratings.into_iter().collect();
    ratings.sort();

    let n_users = users.len();
    let n_movies = movies.len();

    println!("n_users: {}, n_movies: {}", n_users, n_movies);

    let mut Y = vec![vec![-1.0; n_users as usize]; n_movies as usize];
    let mut R = vec![vec![0.0; n_users as usize]; n_movies as usize];

    for rating in ratings.iter() {
        let i = movies.iter().position(|&x| x == rating.movie).unwrap();
        let j = users.iter().position(|&x| x == rating.user).unwrap();
        Y[i][j] = rating.rating();
        R[i][j] = 1.0;
    }
    let R = R.iter().flatten().copied().collect::<Vec<f32>>();
    let R = Tensor::from_slice(&R, (n_movies, n_users), &device)?;

    let Y = Y.iter().flatten().copied().collect::<Vec<f32>>();
    let Y = Tensor::from_slice(&Y, (n_movies, n_users), &device)?;
    let Y = mean_normalization(&Y, &R)?;

    let mut X = Tensor::randn(0f32, 0.1, (n_movies, args.n_features), &device)?;
    let mut W = Tensor::randn(0f32, 0.1, (n_users, args.n_features), &device)?;

    for _ in 0..args.epochs {
        // Gradient step on the observed entries only, with L2 regularization.
        let diff = X.matmul(&W.t()?)?.mul(&R)?.sub(&Y.mul(&R)?)?;
        let grad_X = diff.matmul(&W)?.add(&X.broadcast_mul(&reg)?)?;
        let grad_W = diff.t()?.matmul(&X)?.add(&W.broadcast_mul(&reg)?)?;

        X = X.sub(&grad_X.broadcast_mul(&lr)?)?;
        W = W.sub(&grad_W.broadcast_mul(&lr)?)?;
    }

    // Load movie titles
    let mut rdr = csv::Reader::from_path(&args.movies_csv)?;
    let mut movie_titles = HashMap::new();
    for result in rdr.records() {
        let record = result?;
        let movie_id: u32 = record[0].parse()?;
        let title = record[1].to_string();
        movie_titles.insert(movie_id, title);
    }

    // Choose a random movie and find similar movies
    let mut rng = thread_rng();
    let random_movie_id = movies.choose(&mut rng).unwrap();
    println!("Random movie: {}", movie_titles[random_movie_id]);

    let random_movie_idx = movies.iter().position(|&x| x == *random_movie_id).unwrap();
    let random_index_tensor = Tensor::from_slice(&[random_movie_idx as u32], &[1], &device)?;
    let random_movie_features = X.index_select(&random_index_tensor, 0)?;

    let mut movie_distances: Vec<MovieDistance> = Vec::new();
    for i in 0..n_movies {
        let movie_index_tensor = Tensor::from_slice(&[i as u32], &[1], &device)?;
        let movie_features = X.index_select(&movie_index_tensor, 0)?;
        let dist = cdist(&random_movie_features, &movie_features)?;
        let dist = dist.to_scalar::<f32>()?;
        let movie_distance = MovieDistance::new(movies[i], (dist * 1000.0) as u32);
        movie_distances.push(movie_distance);
    }

    movie_distances.sort();
    for i in 0..10 {
        let movie_id = movie_distances[i].id;
        let distance = movie_distances[i].distance;
        println!(
            "{}: {} (distance: {})",
            i + 1,
            movie_titles[&movie_id],
            distance
        );
    }

    Ok(())
}