feat: detect_material_type - file type identification via magic bytes + MIME + extension

This commit is contained in:
wangdl 2026-05-30 20:34:30 +08:00
parent a0aaf9b713
commit 43010221ff
2 changed files with 191 additions and 0 deletions

View File

@ -6,3 +6,5 @@ edition = "2021"
[dependencies] [dependencies]
serde = { version = "1", features = ["derive"] } serde = { version = "1", features = ["derive"] }
serde_json = "1" serde_json = "1"
infer = "0.16"
mime_guess = "2"

View File

@ -1,5 +1,9 @@
use std::path::Path;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::error::DocumentError;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum MaterialType { pub enum MaterialType {
Markdown, Markdown,
@ -20,3 +24,188 @@ pub enum PreviewMode {
ExternalOpen, ExternalOpen,
Unsupported, Unsupported,
} }
impl MaterialType {
    /// Returns the [`PreviewMode`] used to present material of this type.
    ///
    /// The mapping is exhaustive over every variant, so adding a new
    /// `MaterialType` variant forces this method to be revisited.
    pub fn preview_mode(&self) -> PreviewMode {
        match self {
            // Rendered by the in-app reader.
            Self::Markdown | Self::Text | Self::Image | Self::Epub => PreviewMode::NativeReader,
            // Delegated to the platform's preview facility.
            Self::Pdf | Self::Word | Self::Excel => PreviewMode::PlatformPreview,
            // Handed off to an external application.
            Self::PowerPoint => PreviewMode::ExternalOpen,
            // Nothing sensible to show.
            Self::Unknown => PreviewMode::Unsupported,
        }
    }
}
/// Detect MaterialType from a file path.
///
/// Strategy: magic bytes → MIME → extension
pub fn detect_material_type(file_path: &str) -> Result<MaterialType, DocumentError> {
let path = Path::new(file_path);
// 1. Read file header for magic bytes detection
if let Ok(buf) = std::fs::read(file_path) {
if let Some(info) = infer::get(&buf) {
let inferred = match info.mime_type() {
"application/pdf" => Some(MaterialType::Pdf),
"image/png" | "image/jpeg" | "image/webp" | "image/gif" => Some(MaterialType::Image),
"application/epub+zip" => Some(MaterialType::Epub),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
| "application/msword" => Some(MaterialType::Word),
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
| "application/vnd.ms-excel" => Some(MaterialType::Excel),
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
| "application/vnd.ms-powerpoint" => Some(MaterialType::PowerPoint),
_ => None,
};
if let Some(mt) = inferred {
return Ok(mt);
}
}
}
// 2. Try MIME guess from extension
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
let mime = mime_guess::from_ext(ext).first_or_octet_stream();
match mime.type_() {
mime_guess::mime::TEXT => {
if mime.subtype().as_str() == "markdown"
|| mime.subtype().as_str() == "x-markdown"
{
return Ok(MaterialType::Markdown);
}
return match ext {
"md" | "markdown" => Ok(MaterialType::Markdown),
"txt" | "text" => Ok(MaterialType::Text),
"html" | "htm" | "css" | "js" | "ts" | "rs" | "py" | "java"
| "c" | "cpp" | "h" | "hpp" | "swift" | "kt" | "xml" | "json"
| "yaml" | "yml" | "toml" => Ok(MaterialType::Text),
_ => Ok(MaterialType::Unknown),
};
}
mime_guess::mime::APPLICATION => {
return match ext {
"pdf" => Ok(MaterialType::Pdf),
"epub" => Ok(MaterialType::Epub),
"doc" | "docx" => Ok(MaterialType::Word),
"xls" | "xlsx" => Ok(MaterialType::Excel),
"ppt" | "pptx" => Ok(MaterialType::PowerPoint),
_ => Ok(MaterialType::Unknown),
};
}
mime_guess::mime::IMAGE => return Ok(MaterialType::Image),
_ => {}
}
}
// 3. Extension fallback
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
return Ok(match ext {
"md" | "markdown" => MaterialType::Markdown,
"txt" | "text" => MaterialType::Text,
"pdf" => MaterialType::Pdf,
"png" | "jpg" | "jpeg" | "webp" | "gif" | "bmp" | "svg" => MaterialType::Image,
"epub" => MaterialType::Epub,
"doc" | "docx" => MaterialType::Word,
"xls" | "xlsx" => MaterialType::Excel,
"ppt" | "pptx" => MaterialType::PowerPoint,
_ => MaterialType::Unknown,
});
}
Ok(MaterialType::Unknown)
}
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): these paths are presumably absent from the test working
    // directory, so the cases below exercise name-based detection only.

    /// Runs detection on `path`, panicking if it reports an error.
    fn detect(path: &str) -> MaterialType {
        detect_material_type(path).unwrap()
    }

    #[test]
    fn test_detect_markdown() {
        assert_eq!(detect("test.md"), MaterialType::Markdown);
    }

    #[test]
    fn test_detect_txt() {
        assert_eq!(detect("notes.txt"), MaterialType::Text);
    }

    #[test]
    fn test_detect_pdf() {
        assert_eq!(detect("doc.pdf"), MaterialType::Pdf);
    }

    #[test]
    fn test_detect_image() {
        for path in ["photo.png", "photo.jpg", "photo.jpeg"] {
            assert_eq!(detect(path), MaterialType::Image);
        }
    }

    #[test]
    fn test_detect_epub() {
        assert_eq!(detect("book.epub"), MaterialType::Epub);
    }

    #[test]
    fn test_detect_office() {
        let cases = [
            ("report.docx", MaterialType::Word),
            ("sheet.xlsx", MaterialType::Excel),
            ("deck.pptx", MaterialType::PowerPoint),
        ];
        for (path, expected) in cases {
            assert_eq!(detect(path), expected);
        }
    }

    #[test]
    fn test_detect_unknown() {
        assert_eq!(detect("weird.xyz"), MaterialType::Unknown);
    }

    #[test]
    fn test_detect_no_extension() {
        assert_eq!(detect("README"), MaterialType::Unknown);
    }

    #[test]
    fn test_preview_mode() {
        let cases = [
            (MaterialType::Markdown, PreviewMode::NativeReader),
            (MaterialType::Pdf, PreviewMode::PlatformPreview),
            (MaterialType::PowerPoint, PreviewMode::ExternalOpen),
            (MaterialType::Unknown, PreviewMode::Unsupported),
        ];
        for (material, expected) in cases {
            assert_eq!(material.preview_mode(), expected);
        }
    }
}