最近在学 Rust, 感觉非常好用~
回复 7# 523066680
诶, 这位我有印象. 没想到竟然是 CU 的版主
我最终还是找了个 API 很难用的 crate 改了下, 先用起来再说
这次 100MB 用时 4.4s, 最高内存占用 200MB
看起来内存占用并没有减少, 嘛...因为文件大小太小了看不出来, 我也没耐心弄个几G的文件测试
加上多线程应该还能快一点, 不过试了下加上效果也不是很明显, 估计瓶颈应该在 IO 上 (也有可能是文件太小了...
主体代码如下, 完整代码传百度云了(论坛上传竟然要 Flash 支持?...
链接: https://pan.baidu.com/s/1lvI4UPwOclizsfp108j0EQ 提取码: s39j- use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
- use extsort::*;
- use std::collections::HashMap;
- use std::fs::File;
- use std::io::{self, BufRead, BufReader, Read, Write};
-
- /// 从字符串中提取出"字符串"
- fn get_keywords(text: &str) -> Vec<&str> {
- #[inline]
- fn type_of(b: u8) -> u8 {
- if b.is_ascii_alphabetic() {
- return 1;
- } else if b.is_ascii_digit() {
- return 2;
- } else {
- return 3;
- }
- }
-
- let bytes = text.as_bytes();
- let mut ret = vec![];
- let mut pos = 0;
- for i in 0..bytes.len() - 1 {
- if bytes[i].is_ascii_alphanumeric() && type_of(bytes[i]) != type_of(bytes[i + 1]) {
- ret.push(&text[pos..=i]);
- pos = i + 1;
- } else if !bytes[i].is_ascii_alphanumeric() {
- pos = i + 1;
- }
- }
- if bytes[pos].is_ascii_alphanumeric() && type_of(bytes[pos]) == type_of(bytes[bytes.len() - 1])
- {
- ret.push(&text[pos..]);
- }
- ret
- }
-
- /// 读取文件, 确定每个"字符串"出现的次数
- fn make_keyword_map(path: &str) -> HashMap<String, u32> {
- let file = BufReader::new(File::open(path).expect("无法打开文件"));
- let mut ret = HashMap::new();
-
- for line in file.lines() {
- let line = line.unwrap();
- for keyword in get_keywords(&line) {
- *ret.entry(keyword.to_owned()).or_insert(0) += 1;
- }
- }
-
- ret
- }
-
- /// 根据每行的字符串出现次数生成一个"特征值"
- fn make_line_map(
- path: &str,
- keyword_map: &HashMap<String, u32>,
- ) -> (usize, HashMap<Vec<u32>, Vec<u32>>) {
- let file = BufReader::new(File::open(path).expect("无法打开文件"));
- let mut line_cnt = 0;
- let mut ret = HashMap::new();
-
- for (idx, line) in file.lines().enumerate() {
- let line = line.unwrap();
- let keywords = get_keywords(&line);
-
- let mut occurs_cnt = keywords
- .iter()
- .map(|&s| *keyword_map.get(s).unwrap())
- .collect::<Vec<_>>();
- occurs_cnt.sort_unstable_by(|a, b| b.cmp(a));
-
- ret.entry(occurs_cnt).or_insert(vec![]).push(idx as u32);
- line_cnt = idx;
- }
-
- (line_cnt, ret)
- }
-
- #[derive(Debug, Ord, PartialOrd, Eq, PartialEq)]
- struct Line(u32, String);
-
- impl Sortable<Line> for Line {
- fn encode(item: Line, write: &mut Write) {
- write.write_u32::<LittleEndian>(item.0).unwrap();
- write.write(item.1.as_bytes()).unwrap();
- write.write(&[b'\n']).unwrap();
- }
-
- fn decode(read: &mut Read) -> Option<Line> {
- let idx = read.read_u32::<LittleEndian>().ok()?;
- let mut bytes = read.bytes();
- let s = String::from_utf8(
- bytes
- .by_ref()
- .map(|b| b.unwrap())
- .take_while(|b| *b != b'\n')
- .collect(),
- )
- .unwrap();
- // eprintln!("{} {}", idx, s);
- Some(Line(idx, s))
- }
- }
-
- fn main() {
- let path = "a.txt";
-
- let keyword_map = make_keyword_map(path);
- let (line_cnt, line_map) = make_line_map(path, &keyword_map);
-
- // 排序
- let mut keys = line_map.keys().collect::<Vec<_>>();
- keys.sort_unstable_by(|a, b| b.cmp(a));
-
- // 确定每行的顺序
- let mut line_order = vec![0; line_cnt + 1]; // TODO: 我没记错的话当 line_cnt 很大时会出问题
- let mut order = 0 as u32;
- for key in keys {
- let idxs = line_map.get(key).unwrap();
- for &idx in idxs {
- line_order[idx as usize] = order;
- order += 1;
- }
- }
-
- // 进行外排序
- let file = BufReader::new(File::open(path).expect("无法打开文件"));
- let mut sorter = ExternalSorter::new();
- sorter.set_max_size(1 * 1024 * 1024 * 1024 / 128); // 按每行 128 字节算, 1G 内存每次最多载入多少行
- let sorted_iter = sorter
- .sort_by(
- file.lines()
- .enumerate()
- .map(|(idx, s)| Line(idx as u32, s.unwrap())),
- |a, b| line_order[a.0 as usize].cmp(&line_order[b.0 as usize]),
- )
- .unwrap();
-
- // 输出
- let stdout = io::stdout();
- let mut handle = stdout.lock();
- for line in sorted_iter {
- handle.write(line.1.as_bytes()).unwrap();
- handle.write(&[b'\n']).unwrap();
- }
- }
复制代码
|