Initial commit

2025-12-22 22:19:58 +00:00 · 2024-04-01 20:19:08 -04:00
commit 26c8803281
22 changed files with 1226 additions and 0 deletions
--- a/recommender-system/Cargo.toml
+++ b/recommender-system/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "recommender-system"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+csv = "1.1.6"
+anyhow = "1.0.40"
+clap = {version = "4.3.1", features = ["derive"]} 
+rand = "0.8.5"
+candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }
--- a/recommender-system/README.md
+++ b/recommender-system/README.md
@@ -0,0 +1,3 @@
+# Recommender System
+
+Build a movie [recommender system](https://youtu.be/GIcuSNAAa4g?si=eiKFRfJXek15lO2_) using the collaborative filtering learning algorithm on the [MovieLens-100K](https://www.kaggle.com/datasets/rajmehra03/movielens100k/code) dataset.
--- a/recommender-system/src/main.rs
+++ b/recommender-system/src/main.rs
@@ -0,0 +1,234 @@
+extern crate csv;
+use rand::seq::SliceRandom;
+use rand::thread_rng;
+use std::collections::HashSet;
+use std::vec;
+use std::{cmp::Ordering, collections::HashMap};
+
+use anyhow::Result;
+use candle_core::{Device, Tensor, D};
+use clap::Parser;
+
+#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
+struct Rating { // One (user, movie, rating) observation parsed from the ratings CSV.
+    user: u32, // User id, read from CSV column 0 by `load_ratings`.
+    movie: u32, // Movie id, read from CSV column 1 by `load_ratings`.
+    rating_u32: u32, // Rating multiplied by 10 and rounded; an integer field lets the struct derive `Eq`/`Hash` (an `f32` would not).
+}
+
+impl Rating {
+    /// Returns the rating as a float by undoing the times-ten integer encoding
+    /// stored in `rating_u32`.
+    fn rating(&self) -> f32 {
+        let tenths = self.rating_u32 as f32;
+        tenths / 10.0
+    }
+}
+
+// `Rating` is totally ordered, so `partial_cmp` simply wraps `Ord::cmp` and never returns `None`.
+impl PartialOrd for Rating {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Rating {
+    /// Orders ratings by user id, then movie id, then the stored rating value.
+    ///
+    /// The final tie-break on `rating_u32` keeps `cmp` consistent with the
+    /// derived `PartialEq`/`Eq`, which compare all three fields: two ratings
+    /// compare as `Ordering::Equal` only when they are actually equal. It also
+    /// makes sorting deterministic when the same (user, movie) pair occurs
+    /// with different rating values.
+    fn cmp(&self, other: &Self) -> Ordering {
+        let lhs = (self.user, self.movie, self.rating_u32);
+        let rhs = (other.user, other.movie, other.rating_u32);
+        lhs.cmp(&rhs)
+    }
+}
+
+#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
+struct MovieDistance { // A candidate movie paired with its distance from the query movie.
+    id: u32, // Movie id.
+    distance: u32, // Integer distance; `main` stores the Euclidean feature distance times 1000, cast to `u32`.
+}
+
+// `MovieDistance` is totally ordered, so `partial_cmp` simply wraps `Ord::cmp` and never returns `None`.
+impl PartialOrd for MovieDistance {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for MovieDistance {
+    /// Orders by ascending distance, breaking ties by movie id.
+    ///
+    /// The tie-break keeps `cmp` consistent with the derived `PartialEq`/`Eq`
+    /// (which also compare `id`) and gives equally distant movies a
+    /// deterministic order when sorted.
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.distance
+            .cmp(&other.distance)
+            .then_with(|| self.id.cmp(&other.id))
+    }
+}
+
+impl MovieDistance {
+    /// Builds a `MovieDistance` from a movie id and its integer distance value.
+    fn new(id: u32, distance: u32) -> Self {
+        MovieDistance { id, distance }
+    }
+}
+
+/// Reads the ratings CSV at `file_path` and returns the distinct user ids,
+/// the distinct movie ids, and the distinct ratings found in it.
+///
+/// The reader uses the `csv` crate defaults, so the first row is treated as a
+/// header. For every following row, column 0 is parsed as the user id,
+/// column 1 as the movie id and column 2 as the rating; the rating is stored
+/// multiplied by 10 and rounded (see `Rating::rating_u32`).
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened or read, if a row has fewer
+/// than three columns, or if any of the three fields fails to parse.
+fn load_ratings(file_path: &str) -> Result<(HashSet<u32>, HashSet<u32>, HashSet<Rating>)> {
+    let mut rdr = csv::Reader::from_path(file_path)?;
+    let mut users: HashSet<u32> = HashSet::new();
+    let mut movies: HashSet<u32> = HashSet::new();
+    let mut ratings: HashSet<Rating> = HashSet::new();
+
+    for result in rdr.records() {
+        let record = result?;
+
+        // Checked column access: a short row becomes an error instead of an
+        // out-of-bounds panic.
+        let field = |idx: usize, name: &str| {
+            record.get(idx).ok_or_else(|| {
+                anyhow::anyhow!("{}: row is missing the `{}` column", file_path, name)
+            })
+        };
+
+        let user: u32 = field(0, "user")?.parse()?;
+        let movie: u32 = field(1, "movie")?.parse()?;
+        let rating: f32 = field(2, "rating")?.parse()?;
+        // The float-to-int `as` cast saturates, so a negative or NaN rating is stored as 0.
+        let rating_u32 = (rating * 10.0).round() as u32;
+
+        users.insert(user);
+        movies.insert(movie);
+        ratings.insert(Rating {
+            user,
+            movie,
+            rating_u32,
+        });
+    }
+
+    Ok((users, movies, ratings))
+}
+
+// Command-line options. The field descriptions are `///` doc comments so that
+// clap's derive picks them up as `--help` text; this struct-level note is a
+// plain comment on purpose, to leave the `about` text configured below alone.
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Ratings CSV with a header row; columns 0-2 are user id, movie id and rating
+    #[arg(long)]
+    ratings_csv: String,
+
+    /// Movies CSV with a header row; columns 0-1 are movie id and title
+    #[arg(long)]
+    movies_csv: String,
+
+    /// Number of epochs to train
+    #[arg(long, default_value = "250")]
+    epochs: u32,
+
+    /// Learning rate
+    #[arg(long, default_value = "0.01")]
+    lr: f32,
+
+    /// Regularization factor
+    #[arg(long, default_value = "0.01")]
+    reg: f32,
+
+    /// Number of latent features learned per movie and per user
+    #[arg(long, default_value = "100")]
+    n_features: usize,
+}
+
+/// Euclidean distance between `x1` and `x2`: the square root of the sum of
+/// all squared elementwise differences, returned as a scalar tensor.
+fn cdist(x1: &Tensor, x2: &Tensor) -> Result<Tensor> {
+    let squared_total = x1.sub(x2)?.sqr()?.sum_all()?;
+    Ok(squared_total.sqrt()?)
+}
+
+/// Subtracts from every row of `ratings` that row's mean over the entries
+/// selected by `R`.
+///
+/// `R` is multiplied elementwise with `ratings` and summed along dimension 1;
+/// `main` passes a mask holding 1.0 for rated entries and 0.0 otherwise, so
+/// the mean is taken over rated entries only. The mean is then subtracted
+/// from the whole row, including entries the mask does not select.
+///
+/// NOTE(review): a row whose `R` entries sum to zero divides 0 by 0 here;
+/// `main` never builds such a row, but other callers should confirm.
+fn mean_normalization(ratings: &Tensor, R: &Tensor) -> Result<Tensor> {
+    let rated_total = ratings.mul(R)?.sum(1)?;
+    let rated_count = R.sum(1)?;
+    let row_mean = rated_total.div(&rated_count)?.unsqueeze(1)?;
+    Ok(ratings.broadcast_sub(&row_mean)?)
+}
+
+/// Sum of squared differences between the predictions `X * W^T` and the
+/// targets `Y`, with both sides multiplied elementwise by `R` first.
+///
+/// With a 0/1 mask like the one `main` builds, only rated entries contribute.
+/// No regularization term is included.
+fn cost(X: &Tensor, W: &Tensor, Y: &Tensor, R: &Tensor) -> Result<f32> {
+    let predictions = X.matmul(&W.t()?)?;
+    let masked_targets = Y.mul(R)?;
+    let masked_error = predictions.mul(R)?.sub(&masked_targets)?;
+    let total = masked_error.sqr()?.sum_all()?;
+    Ok(total.to_scalar::<f32>()?)
+}
+
+/// Trains a collaborative-filtering model on the ratings CSV, then picks a
+/// random movie and prints up to ten other movies whose learned feature
+/// vectors are closest to it.
+///
+/// # Errors
+///
+/// Returns an error if either CSV cannot be read or parsed, if the ratings
+/// file contains no ratings, or if any tensor operation fails.
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    // Resolve the device once and reuse it for every tensor.
+    let device = Device::cuda_if_available(0)?;
+    let reg = Tensor::new(args.reg, &device)?;
+    let lr = Tensor::new(args.lr, &device)?;
+
+    let (users, movies, ratings) = load_ratings(&args.ratings_csv)?;
+    if ratings.is_empty() {
+        anyhow::bail!("{}: no ratings found", args.ratings_csv);
+    }
+
+    let mut users: Vec<u32> = users.into_iter().collect();
+    users.sort_unstable();
+
+    let mut movies: Vec<u32> = movies.into_iter().collect();
+    movies.sort_unstable();
+
+    // Sort on all three fields so that, if a (user, movie) pair occurs with
+    // several rating values, the one written last below is deterministic.
+    let mut ratings: Vec<Rating> = ratings.into_iter().collect();
+    ratings.sort_by_key(|r| (r.user, r.movie, r.rating_u32));
+
+    let n_users = users.len();
+    let n_movies = movies.len();
+
+    println!("n_users: {}, n_movies: {}", n_users, n_movies);
+
+    // id -> row/column index, replacing a linear scan per rating.
+    let user_index: HashMap<u32, usize> = users
+        .iter()
+        .enumerate()
+        .map(|(idx, &id)| (id, idx))
+        .collect();
+    let movie_index: HashMap<u32, usize> = movies
+        .iter()
+        .enumerate()
+        .map(|(idx, &id)| (id, idx))
+        .collect();
+
+    // Row-major (n_movies, n_users) buffers: targets start at -1.0 and the
+    // mask at 0.0; rated cells get the rating and 1.0 respectively.
+    let mut target_data = vec![-1.0f32; n_movies * n_users];
+    let mut mask_data = vec![0.0f32; n_movies * n_users];
+    for rating in &ratings {
+        // Every id in `ratings` was also inserted into the id sets by
+        // `load_ratings`, so both lookups always succeed.
+        let offset = movie_index[&rating.movie] * n_users + user_index[&rating.user];
+        target_data[offset] = rating.rating();
+        mask_data[offset] = 1.0;
+    }
+    let mask = Tensor::from_slice(&mask_data, (n_movies, n_users), &device)?;
+    let targets = Tensor::from_slice(&target_data, (n_movies, n_users), &device)?;
+    let targets = mean_normalization(&targets, &mask)?;
+
+    let mut movie_features = Tensor::randn(0f32, 0.1, (n_movies, args.n_features), &device)?;
+    let mut user_weights = Tensor::randn(0f32, 0.1, (n_users, args.n_features), &device)?;
+
+    // The masked targets do not change between epochs, so compute them once.
+    let masked_targets = targets.mul(&mask)?;
+    for _ in 0..args.epochs {
+        let diff = movie_features
+            .matmul(&user_weights.t()?)?
+            .mul(&mask)?
+            .sub(&masked_targets)?;
+        let grad_features = diff
+            .matmul(&user_weights)?
+            .add(&movie_features.broadcast_mul(&reg)?)?;
+        let grad_weights = diff
+            .t()?
+            .matmul(&movie_features)?
+            .add(&user_weights.broadcast_mul(&reg)?)?;
+
+        movie_features = movie_features.sub(&grad_features.broadcast_mul(&lr)?)?;
+        user_weights = user_weights.sub(&grad_weights.broadcast_mul(&lr)?)?;
+    }
+
+    // Load movie titles
+    let mut rdr = csv::Reader::from_path(&args.movies_csv)?;
+    let mut movie_titles: HashMap<u32, String> = HashMap::new();
+    for result in rdr.records() {
+        let record = result?;
+        let id = record.get(0).ok_or_else(|| {
+            anyhow::anyhow!("{}: row is missing the movie id column", args.movies_csv)
+        })?;
+        let title = record.get(1).ok_or_else(|| {
+            anyhow::anyhow!("{}: row is missing the title column", args.movies_csv)
+        })?;
+        movie_titles.insert(id.parse()?, title.to_string());
+    }
+    // A rated movie may be absent from the titles file; print a placeholder
+    // instead of panicking.
+    let title_of = |id: u32| {
+        movie_titles
+            .get(&id)
+            .map(String::as_str)
+            .unwrap_or("<unknown title>")
+    };
+
+    // Choose a random movie and find similar movies
+    let mut rng = thread_rng();
+    let random_movie_id = movies
+        .choose(&mut rng)
+        .copied()
+        .expect("`movies` is non-empty because at least one rating was loaded");
+    println!("Random movie: {}", title_of(random_movie_id));
+
+    // Indices fit in `u32`: there are at most as many rows as distinct `u32` ids.
+    let random_movie_idx = movie_index[&random_movie_id];
+    let random_index_tensor = Tensor::from_slice(&[random_movie_idx as u32], &[1], &device)?;
+    let random_movie_features = movie_features.index_select(&random_index_tensor, 0)?;
+
+    let mut movie_distances: Vec<MovieDistance> = Vec::with_capacity(n_movies.saturating_sub(1));
+    for (idx, &movie_id) in movies.iter().enumerate() {
+        // The query movie is at distance 0 from itself; leave it out of its
+        // own similarity list.
+        if idx == random_movie_idx {
+            continue;
+        }
+        let movie_index_tensor = Tensor::from_slice(&[idx as u32], &[1], &device)?;
+        let candidate_features = movie_features.index_select(&movie_index_tensor, 0)?;
+        let dist = cdist(&random_movie_features, &candidate_features)?.to_scalar::<f32>()?;
+        movie_distances.push(MovieDistance::new(movie_id, (dist * 1000.0) as u32));
+    }
+
+    // Closest first; ties are broken by id so the output order is stable.
+    movie_distances.sort_by_key(|entry| (entry.distance, entry.id));
+    for (rank, entry) in movie_distances.iter().take(10).enumerate() {
+        println!(
+            "{}: {} (distance: {})",
+            rank + 1,
+            title_of(entry.id),
+            entry.distance
+        );
+    }
+
+    Ok(())
+}