Use MappingMinifier to minify crate's description

This commit is contained in:
Folyd 2020-01-10 11:54:35 +08:00
parent 57df7feb0e
commit 8e63b147eb
2 changed files with 73 additions and 72 deletions

View file

@ -11,9 +11,9 @@ use serde_derive::Deserialize;
use serde_json;
use tokio;
use tokio::time::Duration;
use unicode_segmentation::UnicodeSegmentation;
use lazy_static::lazy_static;
use minify::MappingMinifier;
mod minify;
@ -52,60 +52,6 @@ where
}))
}
/// A word paired with the number of times it was counted.
#[derive(Debug)]
struct FrequencyWord {
/// The word text.
word: String,
/// Occurrence count for `word`.
frequency: usize,
}
/// Holds the selected frequency words from which a token mapping is generated.
struct MappingGenerator {
/// Selected words, stored in descending frequency order.
words: Vec<FrequencyWord>,
}
impl MappingGenerator {
    /// Alphabet that supplies the replacement tokens (`$A`, `$B`, ..., `$Z`).
    const UPPERCASE_LETTERS: &'static str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /// Collects the most frequent words found in `words`.
    ///
    /// Every Unicode word whose UTF-8 length is at least five bytes is counted
    /// across all sentences. The words are ranked by descending frequency
    /// (ties are broken lexicographically so the ranking is deterministic) and
    /// at most `top + 1` of them are kept, i.e. `top` is an inclusive index.
    /// Fewer words are kept when the input does not contain that many
    /// distinct candidates.
    ///
    /// # Panics
    ///
    /// Panics if `top` is not smaller than the number of uppercase letters,
    /// because every kept word needs its own letter.
    fn new(words: &Vec<String>, top: usize) -> MappingGenerator {
        assert!(top < Self::UPPERCASE_LETTERS.len());

        let mut counts: HashMap<&str, usize> = HashMap::new();
        for word in words
            .iter()
            .flat_map(|sentence| sentence.unicode_words())
            .filter(|word| word.len() >= 5)
        {
            *counts.entry(word).or_insert(0) += 1;
        }

        let mut frequency_words: Vec<FrequencyWord> = counts
            .into_iter()
            .map(|(word, frequency)| FrequencyWord {
                word: word.to_string(),
                frequency,
            })
            .collect();
        // `HashMap` iteration order is unspecified, so break frequency ties by
        // the word itself to obtain the same ranking on every run.
        frequency_words.sort_by(|a, b| {
            b.frequency
                .cmp(&a.frequency)
                .then_with(|| a.word.cmp(&b.word))
        });
        // `truncate` is a no-op on short vectors, whereas draining `0..=top`
        // would panic when fewer than `top + 1` words are available.
        frequency_words.truncate(top + 1);

        MappingGenerator {
            words: frequency_words,
        }
    }

    /// Builds the token-to-word table: `$A` maps to the most frequent word,
    /// `$B` to the next one, and so on.
    ///
    /// The selected words are also printed to standard output.
    fn generate_mapping(&self) -> HashMap<String, String> {
        println!("words {:?}", self.words);
        self.words
            .iter()
            .zip(Self::UPPERCASE_LETTERS.chars())
            .map(|(fw, letter)| (format!("${}", letter), fw.word.clone()))
            .collect()
    }
}
async fn fetch_crates(page: u32) -> Result<Vec<Crate>, Box<dyn std::error::Error>> {
// Keep a 1-second sleep interval to comply with the crates.io crawler policy.
tokio::time::delay_for(Duration::from_secs((1 * (page - 1)) as u64)).await;
@ -119,7 +65,10 @@ async fn fetch_crates(page: u32) -> Result<Vec<Crate>, Box<dyn std::error::Error
Ok(resp.crates)
}
async fn generate_javascript_crates_index(crates: Vec<Crate>) -> std::io::Result<String> {
async fn generate_javascript_crates_index(
crates: Vec<Crate>,
mapping_minifier: &MappingMinifier,
) -> std::io::Result<String> {
let mut contents = String::from("var N=null;");
let crates_map: HashMap<String, [Option<String>; 3]> = crates
.into_iter()
@ -127,7 +76,7 @@ async fn generate_javascript_crates_index(crates: Vec<Crate>) -> std::io::Result
(
item.id.to_lowercase(),
[
item.description.map(minify::minify_description),
item.description.map(|value| mapping_minifier.minify(value)),
item.documentation.map(minify::minify_url),
item.max_version,
],
@ -166,9 +115,9 @@ async fn main() -> std::io::Result<()> {
})
.collect();
// Extract frequency word mapping
let mapping = MappingGenerator::new(&STRING_VEC.read().unwrap(), 25).generate_mapping();
println!("{:?}", mapping);
let contents = generate_javascript_crates_index(crates).await?;
let mapping_minifier = MappingMinifier::new(&STRING_VEC.read().unwrap(), 25);
println!("{:?}", mapping_minifier);
let contents = generate_javascript_crates_index(crates, &mapping_minifier).await?;
fs::write(path, &contents)?;
println!("\nGenerate javascript crates index successful!");
Ok(())

View file

@ -1,19 +1,71 @@
use std::collections::HashMap;
use std::ops::Deref;
use minifier::js::{
aggregate_strings_into_array_filter, simple_minify, Keyword, ReservedChar, Token, Tokens,
};
use unicode_segmentation::UnicodeSegmentation;
pub(crate) fn minify_description(mut value: String) -> String {
value.truncate(100);
value
.replace("Rust", "$R")
.replace("rust", "$r")
.replace("library", "$l")
.replace("Library", "$L")
.replace("Google", "$G")
.replace("implementation", "$i")
.replace("binding", "$b")
.replace("support", "$s")
.replace("crate", "$c")
/// A word paired with the number of times it was counted.
#[derive(Debug)]
struct FrequencyWord {
/// The word text.
word: String,
/// Occurrence count for `word`.
frequency: usize,
}
/// Minifies text by substituting frequent words with short `$`-prefixed tokens.
#[derive(Debug)]
pub(crate) struct MappingMinifier {
/// Lookup table from a word to its replacement token (for example `$A`).
mapping: HashMap<String, String>,
}
impl MappingMinifier {
    /// Alphabet that supplies the replacement tokens (`$A`, `$B`, ..., `$Z`).
    const UPPERCASE_LETTERS: &'static str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /// Builds a minifier from the most frequent words found in `words`.
    ///
    /// Every Unicode word whose UTF-8 length is at least five bytes is counted
    /// across all sentences. The words are ranked by descending frequency
    /// (ties are broken lexicographically so the mapping is deterministic) and
    /// at most `top + 1` of them are mapped, i.e. `top` is an inclusive index.
    /// The most frequent word becomes `$A`, the next one `$B`, and so on.
    /// Fewer words are mapped when the input does not contain that many
    /// distinct candidates.
    ///
    /// # Panics
    ///
    /// Panics if `top` is not smaller than the number of uppercase letters,
    /// because every mapped word needs its own letter.
    pub fn new(words: &Vec<String>, top: usize) -> MappingMinifier {
        assert!(top < Self::UPPERCASE_LETTERS.len());

        let mut counts: HashMap<&str, usize> = HashMap::new();
        for word in words
            .iter()
            .flat_map(|sentence| sentence.unicode_words())
            .filter(|word| word.len() >= 5)
        {
            *counts.entry(word).or_insert(0) += 1;
        }

        let mut frequency_words: Vec<FrequencyWord> = counts
            .into_iter()
            .map(|(word, frequency)| FrequencyWord {
                word: word.to_string(),
                frequency,
            })
            .collect();
        // `HashMap` iteration order is unspecified, so break frequency ties by
        // the word itself to obtain the same mapping on every run.
        frequency_words.sort_by(|a, b| {
            b.frequency
                .cmp(&a.frequency)
                .then_with(|| a.word.cmp(&b.word))
        });

        MappingMinifier {
            mapping: frequency_words
                .into_iter()
                // `take` stops early on short inputs, whereas draining
                // `0..=top` would panic when fewer than `top + 1` words exist.
                .take(top + 1)
                .zip(Self::UPPERCASE_LETTERS.chars())
                .map(|(fw, letter)| (fw.word, format!("${}", letter)))
                .collect(),
        }
    }

    /// Returns `value` with every mapped word replaced by its token.
    ///
    /// The text is split on Unicode word boundaries; segments that are not in
    /// the mapping (including whitespace and punctuation) are copied through
    /// unchanged.
    pub fn minify(&self, value: String) -> String {
        value
            .split_word_bounds()
            .map(|segment| {
                self.mapping
                    .get(segment)
                    .map(Deref::deref)
                    .unwrap_or(segment)
            })
            .collect()
    }
}
pub(crate) fn minify_url(url: String) -> String {