diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000000000000000000000000000000000000..a077abe82fd2d455d5d3213f329799e27fe5ebd6 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "suggestible" +version = "0.1.0" +edition = "2021" + +[dependencies] +tiny_http = "0.11" +csv = "1.1" +url = "2.5.0" diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c0ded606f72a8cbad8597deb37c97407e473416b --- /dev/null +++ b/Makefile @@ -0,0 +1,43 @@ +# Rust project name +PROJECT_NAME := Suggestible + +# Directories +SRC_DIR := src +DOC_DIR := doc +TARGET_DIR := target + +# Binary name +BIN_NAME := $(PROJECT_NAME) + +# Binary paths +BIN_PATH := $(TARGET_DIR)/debug/$(BIN_NAME) +RELEASE_BIN_PATH := $(TARGET_DIR)/release/$(BIN_NAME) + +# Compilation commands +RUSTC := rustc +CARGO := cargo + +# Documentation commands +MDBOOK := mdbook + +# Targets +.PHONY: all clean doc + +all: $(BIN_PATH) + +release: $(RELEASE_BIN_PATH) + +$(BIN_PATH): $(SRC_DIR)/*.rs + $(CARGO) build + +$(RELEASE_BIN_PATH): $(SRC_DIR)/*.rs + $(CARGO) build --release + +doc: + $(CARGO) doc --no-deps + cp -r target/doc/* doc/dev/rustdoc + $(MDBOOK) build + +clean: + $(CARGO) clean + $(MDBOOK) clean diff --git a/book.toml b/book.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f23a7add110fd079b2bed55ac7de7240a9e6513 --- /dev/null +++ b/book.toml @@ -0,0 +1,9 @@ +[book] +authors = ["Phil Höfer"] +language = "en" +multilingual = false +src = "doc" +title = "Suggestible" + +[output.html] +mathjax-support = true \ No newline at end of file diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f9d91786ae21d2211f56f8d1a73c366678899935 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,23 @@ +# Introduction + +Suggestible is software toolbox for creating an auto-suggestion endpoint from a search query log file. 
+It aims to be a complete solution including features like: +- parsing and filtering +- multiple configurable suggestion algorithms +- automatic model updates from an incoming stream of query logs +- ready-to-use OpenSearch suggest server + +## About this Documentation + +Documentation is structured in two parts: User documentation and Development documentation. If you just want to use the software, the former is for you. +Furthermore, we acknowledge divio's [documentation system](https://docs.divio.com/documentation-system/), consisting of: +* learning oriented **tutorials â—†**, +* problem oriented **how-to guides â—‡**, +* understanding oriented **explanations â—ˆ** and +* information oriented **references â–**. + +You can see which is which using the icons placed after a document's title. + +## About the Software + +Suggestible is written in Rust, which means it can be very resource efficient while also being very safe. diff --git a/doc/SUMMARY.md b/doc/SUMMARY.md new file mode 100644 index 0000000000000000000000000000000000000000..5d0c65f40bae7cbb53ee6b4b63000e4132f60ff7 --- /dev/null +++ b/doc/SUMMARY.md @@ -0,0 +1,20 @@ +# Summary + +[Introduction](README.md) + +# User Documentation +- [Overview](./user/index.md) +- [Trying out Suggestible â—†](./user/tutorial_basic.md) +- [Building Suggestible â—‡](./user/howto_build.md) +- [Using Suggestible â—‡](./user/howto_basic.md) +- [Configuration Options â–](./user/ref_config.md) + - [Prediction Models â–](./user/ref_predmodels.md) +- [Further Reading]() + - [Understanding Markov Chains â—ˆ](./user/exp_markov.md) + +# Developer Documentation
- [Overview](./dev/index.md) +- [Rustdoc â–](./dev/rustdoc/suggestible/index.html) +- [Auxiliary Documentation]() + - [Filtering â–](./dev/ref_filtering.md) + - [Glossary â–](./dev/ref_glossary.md) diff --git a/doc/dev/index.md b/doc/dev/index.md new file mode 100644 index 0000000000000000000000000000000000000000..8a0d9d896fa1344019731d81b2db48b148ccd9a0 --- /dev/null +++ 
b/doc/dev/index.md @@ -0,0 +1,3 @@ +# Overview + +- [Filtering](filtering.md) diff --git a/doc/dev/ref_filtering.md b/doc/dev/ref_filtering.md new file mode 100644 index 0000000000000000000000000000000000000000..d0200e110628eef9aeb33599d348378592e532b4 --- /dev/null +++ b/doc/dev/ref_filtering.md @@ -0,0 +1,14 @@ +# Filtering +This document outlines the steps and measures we take to hopefully prevent Suggestible from giving suggestions that expose unwanted qualities of the training data. + +## Risks +The identified risks for Suggestible are as follows: +1. Leakage of personal data in the training data into the predictive model +2. Reproduction of (political or other) biases from the training data +3. Suggestion of NSFW query completions for SFW prompts +4. Output of content (or links) that are prohibited to share (ex. IP law) +5. Output of irrelevant content + +## Measures + +### Leakage of personal data in the training data into the predictive model \ No newline at end of file diff --git a/doc/dev/ref_glossary.md b/doc/dev/ref_glossary.md new file mode 100644 index 0000000000000000000000000000000000000000..7844001db46fd120c13e5a7ae99a71d84f16c6b5 --- /dev/null +++ b/doc/dev/ref_glossary.md @@ -0,0 +1,14 @@ +# Glossary â– + +- **Predictor**: + - Algorithm used to predict a suggestion. + +- **Prediction Model**: + - Database used to supplement a predictor. Can be trained from search query input data. + +- **Markov Chain**: + - Stochastic prediction model describing a sequence of possible words where the probability of each word depends only on the previous word. + +- **Markov Chain Predictor**: + - Predictor utilising a Markov chain. 
+ diff --git a/doc/user/exp_markov.md b/doc/user/exp_markov.md new file mode 100644 index 0000000000000000000000000000000000000000..da69badac83a575ee559934fc26a97455394208d --- /dev/null +++ b/doc/user/exp_markov.md @@ -0,0 +1,70 @@ +# Understanding Markov Chains + +**Markov Chains** are mathematical systems that undergo transitions from one state to another within a finite or countable number of possible states. Named after the Russian mathematician Andrey Markov, these chains are widely used in statistical modeling and various practical applications, such as predicting the next word in a search query. + +## How Markov Chains Work + +A Markov Chain follows the *Markov property*, which states that the future state depends only on the current state and not on the sequence of events that preceded it. This property is often summarized as "memorylessness." + +#### Key Components + +1. **States**: Distinct conditions or positions in which the system can exist. For example, in predicting the next word, each state represents a different word. +2. **Transition Probabilities**: The probabilities of moving from one state to another. These are usually represented in a *transition matrix*. + +### Transition Matrix + +A transition matrix \\( P \\) is a square matrix used to describe the probabilities of transitioning from one state to another. Each element \\( P_{ij} \\) represents the probability of moving from state \\( i \\) to state \\( j \\). + +Let's consider two search queries: "buy cheap laptops" and "buy cheap flights." The words involved are "buy", "cheap", "laptops", and "flights". 
The transition matrix might look like this: + +| | Buy | Cheap | Laptops | Flights | +|--------------|-----|-------|---------|---------| +| **Buy** | 0.0 | 1.0 | 0.0 | 0.0 | +| **Cheap** | 0.0 | 0.0 | 0.5 | 0.5 | +| **Laptops** | 0.0 | 0.0 | 0.0 | 0.0 | +| **Flights** | 0.0 | 0.0 | 0.0 | 0.0 | + +In this matrix: +- \\( P_{12} = 1.0 \\) means there's a 100% chance of "cheap" following "buy". +- \\( P_{23} = 0.5 \\) means there's a 50% chance of "laptops" following "cheap". +- \\( P_{24} = 0.5 \\) means there's a 50% chance of "flights" following "cheap". + +### Predicting the Next Word + +Let's say you're typing a search query, and the current state is "cheap". To predict the next word using a Markov Chain, we look at the row corresponding to "cheap" in the transition matrix: + +| Word | Probability | +|--------|-------------| +| Laptops | 0.5 | +| Flights | 0.5 | + +The probabilities tell us the likelihood of each possible next word. Here, "laptops" and "flights" have equal probabilities of 0.5. + +## Example Calculation + +If we start with the words "buy cheap" and want to predict the next word, we can use the transition matrix to calculate the probabilities step-by-step: + +1. **Current State**: "buy" +2. **Next State Probabilities**: + - "cheap": 1.0 + +Assuming "cheap" is chosen (since it's the only option), we then look at the row for "cheap": + +3. **Next State after "cheap"**: + - "laptops": 0.5 + - "flights": 0.5 + +Since "laptops" and "flights" have equal probabilities, either could be the next word. Thus, "buy cheap laptops" and "buy cheap flights" are both valid predictions. + +### Applications Beyond Search Queries + +Markov Chains are used in various domains such as: + +- **Text Generation**: Creating sentences or paragraphs by predicting the next word. +- **Financial Modeling**: Predicting stock prices and market trends. +- **Game Theory**: Modeling decisions in strategic games. 
+ + +## Conclusion + +Markov Chains are valuable tools for modeling systems with probabilistic transitions between states. By understanding the current state and transition probabilities, we can predict future states effectively. However, it's crucial to be aware of their limitations and biases to ensure accurate and unbiased predictions. \ No newline at end of file diff --git a/doc/user/howto_basic.md b/doc/user/howto_basic.md new file mode 100644 index 0000000000000000000000000000000000000000..4d46ab1e3dc83fcf738415b9cce167ee9c4b8ed3 --- /dev/null +++ b/doc/user/howto_basic.md @@ -0,0 +1,5 @@ +# Using Suggestible â—‡ +This how-to helps you set up Suggestible using your own data. + +If you want to just try out Suggestible with some sample data at first, we recommend trying the [introductory tutorial](tutorial_basic.md) first. +If you don't have a compiled version of Suggestible yet, follow our [build how-to](howto_build.md). \ No newline at end of file diff --git a/doc/user/howto_build.md b/doc/user/howto_build.md new file mode 100644 index 0000000000000000000000000000000000000000..35191a8c859d353565dada6e2333091318953b88 --- /dev/null +++ b/doc/user/howto_build.md @@ -0,0 +1 @@ +# Building Suggestible â—‡ diff --git a/doc/user/index.md b/doc/user/index.md new file mode 100644 index 0000000000000000000000000000000000000000..2672351d684c086814ff1b2ec78978a2f287f570 --- /dev/null +++ b/doc/user/index.md @@ -0,0 +1,13 @@ +# Overview + +## Reading Journeys + +### Show me what Suggestible can do + +1. [Trying out Suggestible â—†](tutorial_basic.md) +2. [Understanding Markov Chains â—ˆ](exp_markov.md) + +### I want to use Suggestible immediately + +1. [Building Suggestible â—‡](howto_build.md) +2. 
[Using Suggestible â—‡](howto_basic.md) \ No newline at end of file diff --git a/doc/user/ref_config.md b/doc/user/ref_config.md new file mode 100644 index 0000000000000000000000000000000000000000..83708001af007847bc5dcb76d8178cde503fbbb3 --- /dev/null +++ b/doc/user/ref_config.md @@ -0,0 +1 @@ +# Configuration Options â– diff --git a/doc/user/ref_predmodels.md b/doc/user/ref_predmodels.md new file mode 100644 index 0000000000000000000000000000000000000000..2d456f1a580594117f2e89d26a0732c0fc189705 --- /dev/null +++ b/doc/user/ref_predmodels.md @@ -0,0 +1 @@ +# Prediction Models â– diff --git a/doc/user/tutorial_basic.md b/doc/user/tutorial_basic.md new file mode 100644 index 0000000000000000000000000000000000000000..15aea66d57ce035e44313f9c33efb40cffe86000 --- /dev/null +++ b/doc/user/tutorial_basic.md @@ -0,0 +1 @@ +# Trying out Suggestible diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000000000000000000000000000000000000..3fcb08715b0e717b8d8f4fc921388b8cb5fc2895 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,153 @@ +use std::collections::HashMap; +use std::fs::File; +use std::io::{self, BufRead, BufReader}; +use std::error::Error; +use std::str::FromStr; + +use csv::ReaderBuilder; + + +type MarkovChain = HashMap<String, HashMap<String, usize>>; + +use tiny_http::{Server, Response}; +use url::Url; + + + +fn main() -> Result<(), io::Error> { + + let markov_chain = build_markov_chain("data_full.csv")?; + let filtered_markov_chain = filter_markov_chain(&markov_chain,1); + + // Print the Markov Chain for verification + for (key, values) in &markov_chain { + // println!("{}: {:?}", key, values); + } + + // Test the function + let word = "wie"; // replace with the word you want to check + if let Some(top_words) = get_top_following_words(&filtered_markov_chain, word, 3) { + println!("Top following words for '{}': {:?}", word, top_words); + } else { + println!("No following words found for '{}'", word); + } + + + + let server = 
Server::http("0.0.0.0:80").expect("could not bind HTTP server to 0.0.0.0:80");

    for request in server.incoming_requests() {
        match get_query(request.url()) {
            Ok(query) => {
                let prediction = predictn(&filtered_markov_chain, &query, 5);
                println!("Query: {}, Prediction:{}", query, prediction);
                let response = Response::from_string(prediction);
                // `respond` fails when the client disconnects early; log the
                // error instead of silently discarding the `Result`.
                if let Err(e) = request.respond(response) {
                    eprintln!("Error sending response: {}", e);
                }
            }
            Err(e) => {
                // NOTE(review): the request is dropped unanswered on a parse
                // error; consider replying with an HTTP 400 instead.
                println!("Error: {}", e);
            }
        }
    }

    Ok(())
}

/// Extract the value of the `q` query parameter from a request URL such as
/// `/?q=foo+bar`.
///
/// Returns an empty string when no `q` parameter is present. The `Result`
/// return type is kept for interface compatibility; the current
/// implementation never returns `Err`.
fn get_query(request_url: &str) -> Result<String, url::ParseError> {
    let query_string = request_url.strip_prefix("/?").unwrap_or(request_url);
    for (key, value) in url::form_urlencoded::parse(query_string.as_bytes()) {
        if key == "q" {
            return Ok(value.into_owned());
        }
    }
    Ok(String::new())
}

/// Build a first-order Markov chain from a CSV query log.
///
/// For every query found in column index 5 of the CSV at `file_path`, each
/// adjacent word pair (lowercased, whitespace-split) increments the
/// transition count `first -> second`.
///
/// NOTE(review): the query column index (5) is hard-coded — confirm it
/// matches the log format, and consider making it configurable.
///
/// # Errors
/// Returns an `io::Error` if the file cannot be opened or a record cannot
/// be read.
fn build_markov_chain(file_path: &str) -> Result<MarkovChain, io::Error> {
    let file = File::open(file_path)?;
    let mut reader = ReaderBuilder::new().from_reader(file);
    let mut markov_chain: MarkovChain = HashMap::new();

    for result in reader.records() {
        let record = result?;
        if let Some(query) = record.get(5) {
            let lowercase_query = query.to_lowercase();
            let words: Vec<&str> = lowercase_query.split_whitespace().collect();
            for window in words.windows(2) {
                if let [first, second] = window {
                    // `lowercase_query` is already lowercased, so no second
                    // `to_lowercase()` call is needed on the key.
                    *markov_chain
                        .entry((*first).to_string())
                        .or_default()
                        .entry((*second).to_string())
                        .or_insert(0) += 1;
                }
            }
        }
    }

    Ok(markov_chain)
}

/// Return a copy of `markov_chain` keeping only transitions whose count is
/// at least `min_count`; predecessor words left without any follower are
/// dropped entirely.
///
/// Note: with `min_count == 1` this returns the chain unchanged, since every
/// recorded transition has a count of at least 1.
fn filter_markov_chain(markov_chain: &MarkovChain, min_count: usize) -> MarkovChain {
    markov_chain
        .iter()
        .filter_map(|(key, followers)| {
            let kept: HashMap<String, usize> = followers
                .iter()
                .filter(|&(_, &count)| count >= min_count)
                .map(|(word, &count)| (word.clone(), count))
                .collect();
            if kept.is_empty() {
                None
            } else {
                Some((key.clone(), kept))
            }
        })
        .collect()
}

/// Return the up-to-`top_n` most frequent words following `word`, or `None`
/// if `word` never appears as a predecessor in the chain.
fn get_top_following_words(
    markov_chain: &MarkovChain,
    word: &str,
    top_n: usize,
) -> Option<Vec<(String, usize)>> {
    let following_words = markov_chain.get(word)?;

    let mut sorted_words: Vec<(String, usize)> = following_words
        .iter()
        .map(|(word, &count)| (word.clone(), count))
        .collect();
    // Sort by descending count and break ties alphabetically so the result
    // is deterministic (HashMap iteration order is not).
    sorted_words.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));

    Some(sorted_words.into_iter().take(top_n).collect())
}

/// Predict the single most likely continuation of `query`, returning
/// `"query word"` or an empty string when nothing can be predicted.
/// Currently unused by the server path, which calls `predictn` instead.
fn predict(markov_chain: &MarkovChain, query: &str) -> String {
    if let Some(top_words) = get_top_following_words(markov_chain, query, 1) {
        if let Some((predicted_word, _)) = top_words.first() {
            return format!("{} {}", query, predicted_word);
        }
    }
    String::new()
}

/// Format up to `n` suggestions for `query` as an OpenSearch-style JSON
/// suggestion response: `["query",["query w1","query w2",...]]`.
/// Returns an empty string when no suggestion is available.
fn predictn(markov_chain: &MarkovChain, query: &str, n: usize) -> String {
    if let Some(top_words) = get_top_following_words(markov_chain, query, n) {
        // Escape the user-supplied query so quotes/backslashes in it cannot
        // produce malformed JSON.
        let escaped_query = json_escape(query);
        let predictions: Vec<String> = top_words
            .into_iter()
            .map(|(word, _)| format!("\"{} {}\"", escaped_query, json_escape(&word)))
            .collect();
        return format!("[\"{}\",[{}]]", escaped_query, predictions.join(","));
    }
    String::new()
}

/// Escape backslashes and double quotes so a string can be embedded inside a
/// JSON string literal.
fn json_escape(s: &str) -> String {
    s.replace('\\', "\\\\").replace('"', "\\\"")
}