Inital commit

This commit is contained in:
Vishal Patil
2024-04-01 20:19:08 -04:00
commit 26c8803281
22 changed files with 1226 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
[package]
name = "anamoly-detection"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
csv = "1.1.6"
anyhow = "1.0.40"
clap = {version = "4.3.1", features = ["derive"]}
rand = "0.8.5"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.4.1", features = ["cuda"] }

View File

@@ -0,0 +1,3 @@
# Anamoly Detection
[Anamoly Detection](https://youtu.be/UqqPm-Q4aMo?si=TCZFJOJv94R1i71u) using Gaussian Distribution for the Kaggle [EECS 498 dataset](https://www.kaggle.com/c/eecs498/data).

View File

@@ -0,0 +1,100 @@
extern crate csv;
use std::vec;
use anyhow::Result;
use candle_core::{Device, Tensor};
use clap::Parser;
fn load_dataset(file_path: &str, device: &Device) -> Result<Tensor> {
let mut rdr = csv::Reader::from_path(file_path)?;
let mut data = Vec::new();
for result in rdr.records() {
let record = result?;
let mut row = vec![];
for i in 1..4 {
row.push(record[i].parse::<f64>()?);
}
data.push(row);
}
let feature_cnt = data[0].len();
let sample_cnt = data.len();
let data = data.into_iter().flatten().collect::<Vec<_>>();
let data = Tensor::from_slice(data.as_slice(), (sample_cnt, feature_cnt), device)?;
Ok(data)
}
fn z_score_normalize(data: &Tensor) -> Result<Tensor> {
let mean = data.mean(0)?;
let squared_diff = data.broadcast_sub(&mean)?.sqr()?;
let variance = squared_diff.mean(0)?;
let std_dev = variance.sqrt()?;
let normalized = data.broadcast_sub(&mean)?.broadcast_div(&std_dev)?;
Ok(normalized)
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
// Data CSV file from https://www.kaggle.com/c/eecs498/data
#[arg(long)]
data_csv: String,
#[arg(long, short, default_value = "false")]
print: bool,
#[arg(long, default_value = "0.001")]
episilon: f64,
}
fn p_x(
x: &Tensor,
mean: &Tensor,
two_variance: &Tensor,
two_pi_sqrt_std_dev: &Tensor,
) -> Result<f64> {
let px = x
.broadcast_sub(mean)?
.sqr()?
.broadcast_div(two_variance)?
.exp()?
.broadcast_mul(two_pi_sqrt_std_dev)?
.recip()?;
let px = px.to_vec1::<f64>()?.into_iter().fold(1.0, |acc, x| acc * x);
Ok(px)
}
fn main() -> Result<()> {
let args = Args::parse();
let device = Device::cuda_if_available(0)?;
let data = load_dataset(&args.data_csv, &device)?;
let data = z_score_normalize(&data)?;
let mean = data.mean(0)?;
let variance = data.broadcast_sub(&mean)?.sqr()?.mean(0)?;
let std_dev = variance.sqrt()?;
let two_variance = variance.broadcast_mul(&Tensor::new(2.0, &device)?)?;
let two_pi_sqrt_std_dev =
std_dev.broadcast_mul(&Tensor::new(2.0 * std::f64::consts::PI, &device)?.sqrt()?)?;
let rows = data.shape().dims2()?.0;
let mut anamolies = 0;
for row in 0..rows {
let row_tensor = data
.index_select(&Tensor::new(&[row as u32], &device)?, 0)?
.squeeze(0)?;
let px = p_x(&row_tensor, &mean, &two_variance, &two_pi_sqrt_std_dev)?;
if px < args.episilon {
anamolies += 1;
if args.print {
println!("Anamoly: {}", row + 1);
}
}
}
println!("Anamolies: {}, Total: {}", anamolies, rows);
Ok(())
}