feat: detect_material_type - file type identification via magic bytes + MIME + extension
This commit is contained in:
parent
a0aaf9b713
commit
43010221ff
@ -6,3 +6,5 @@ edition = "2021"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
serde = { version = "1", features = ["derive"] }
|
serde = { version = "1", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
|
infer = "0.16"
|
||||||
|
mime_guess = "2"
|
||||||
|
|||||||
@ -1,5 +1,9 @@
|
|||||||
|
use std::path::Path;
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::error::DocumentError;
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
pub enum MaterialType {
|
pub enum MaterialType {
|
||||||
Markdown,
|
Markdown,
|
||||||
@ -20,3 +24,188 @@ pub enum PreviewMode {
|
|||||||
ExternalOpen,
|
ExternalOpen,
|
||||||
Unsupported,
|
Unsupported,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl MaterialType {
|
||||||
|
pub fn preview_mode(&self) -> PreviewMode {
|
||||||
|
match self {
|
||||||
|
MaterialType::Markdown | MaterialType::Text | MaterialType::Image => {
|
||||||
|
PreviewMode::NativeReader
|
||||||
|
}
|
||||||
|
MaterialType::Pdf | MaterialType::Word | MaterialType::Excel => {
|
||||||
|
PreviewMode::PlatformPreview
|
||||||
|
}
|
||||||
|
MaterialType::PowerPoint => PreviewMode::ExternalOpen,
|
||||||
|
MaterialType::Epub => PreviewMode::NativeReader,
|
||||||
|
MaterialType::Unknown => PreviewMode::Unsupported,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect MaterialType from a file path.
|
||||||
|
///
|
||||||
|
/// Strategy: magic bytes → MIME → extension
|
||||||
|
pub fn detect_material_type(file_path: &str) -> Result<MaterialType, DocumentError> {
|
||||||
|
let path = Path::new(file_path);
|
||||||
|
|
||||||
|
// 1. Read file header for magic bytes detection
|
||||||
|
if let Ok(buf) = std::fs::read(file_path) {
|
||||||
|
if let Some(info) = infer::get(&buf) {
|
||||||
|
let inferred = match info.mime_type() {
|
||||||
|
"application/pdf" => Some(MaterialType::Pdf),
|
||||||
|
"image/png" | "image/jpeg" | "image/webp" | "image/gif" => Some(MaterialType::Image),
|
||||||
|
"application/epub+zip" => Some(MaterialType::Epub),
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
|
| "application/msword" => Some(MaterialType::Word),
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
| "application/vnd.ms-excel" => Some(MaterialType::Excel),
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||||
|
| "application/vnd.ms-powerpoint" => Some(MaterialType::PowerPoint),
|
||||||
|
_ => None,
|
||||||
|
};
|
||||||
|
if let Some(mt) = inferred {
|
||||||
|
return Ok(mt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Try MIME guess from extension
|
||||||
|
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||||
|
let mime = mime_guess::from_ext(ext).first_or_octet_stream();
|
||||||
|
match mime.type_() {
|
||||||
|
mime_guess::mime::TEXT => {
|
||||||
|
if mime.subtype().as_str() == "markdown"
|
||||||
|
|| mime.subtype().as_str() == "x-markdown"
|
||||||
|
{
|
||||||
|
return Ok(MaterialType::Markdown);
|
||||||
|
}
|
||||||
|
return match ext {
|
||||||
|
"md" | "markdown" => Ok(MaterialType::Markdown),
|
||||||
|
"txt" | "text" => Ok(MaterialType::Text),
|
||||||
|
"html" | "htm" | "css" | "js" | "ts" | "rs" | "py" | "java"
|
||||||
|
| "c" | "cpp" | "h" | "hpp" | "swift" | "kt" | "xml" | "json"
|
||||||
|
| "yaml" | "yml" | "toml" => Ok(MaterialType::Text),
|
||||||
|
_ => Ok(MaterialType::Unknown),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
mime_guess::mime::APPLICATION => {
|
||||||
|
return match ext {
|
||||||
|
"pdf" => Ok(MaterialType::Pdf),
|
||||||
|
"epub" => Ok(MaterialType::Epub),
|
||||||
|
"doc" | "docx" => Ok(MaterialType::Word),
|
||||||
|
"xls" | "xlsx" => Ok(MaterialType::Excel),
|
||||||
|
"ppt" | "pptx" => Ok(MaterialType::PowerPoint),
|
||||||
|
_ => Ok(MaterialType::Unknown),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
mime_guess::mime::IMAGE => return Ok(MaterialType::Image),
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Extension fallback
|
||||||
|
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||||
|
return Ok(match ext {
|
||||||
|
"md" | "markdown" => MaterialType::Markdown,
|
||||||
|
"txt" | "text" => MaterialType::Text,
|
||||||
|
"pdf" => MaterialType::Pdf,
|
||||||
|
"png" | "jpg" | "jpeg" | "webp" | "gif" | "bmp" | "svg" => MaterialType::Image,
|
||||||
|
"epub" => MaterialType::Epub,
|
||||||
|
"doc" | "docx" => MaterialType::Word,
|
||||||
|
"xls" | "xlsx" => MaterialType::Excel,
|
||||||
|
"ppt" | "pptx" => MaterialType::PowerPoint,
|
||||||
|
_ => MaterialType::Unknown,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(MaterialType::Unknown)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs detection on `path` and unwraps the result.
    ///
    /// None of the paths used below are created by these tests, so the cases
    /// are written against the name-based stages of detection.
    fn detect(path: &str) -> MaterialType {
        detect_material_type(path).unwrap()
    }

    #[test]
    fn test_detect_markdown() {
        assert_eq!(detect("test.md"), MaterialType::Markdown);
    }

    #[test]
    fn test_detect_txt() {
        assert_eq!(detect("notes.txt"), MaterialType::Text);
    }

    #[test]
    fn test_detect_pdf() {
        assert_eq!(detect("doc.pdf"), MaterialType::Pdf);
    }

    #[test]
    fn test_detect_image() {
        for path in ["photo.png", "photo.jpg", "photo.jpeg"] {
            assert_eq!(detect(path), MaterialType::Image, "path: {path}");
        }
    }

    #[test]
    fn test_detect_epub() {
        assert_eq!(detect("book.epub"), MaterialType::Epub);
    }

    #[test]
    fn test_detect_office() {
        let cases = [
            ("report.docx", MaterialType::Word),
            ("sheet.xlsx", MaterialType::Excel),
            ("deck.pptx", MaterialType::PowerPoint),
        ];
        for (path, expected) in cases {
            assert_eq!(detect(path), expected, "path: {path}");
        }
    }

    #[test]
    fn test_detect_unknown() {
        assert_eq!(detect("weird.xyz"), MaterialType::Unknown);
    }

    #[test]
    fn test_detect_no_extension() {
        assert_eq!(detect("README"), MaterialType::Unknown);
    }

    #[test]
    fn test_preview_mode() {
        let cases = [
            (MaterialType::Markdown, PreviewMode::NativeReader),
            (MaterialType::Pdf, PreviewMode::PlatformPreview),
            (MaterialType::PowerPoint, PreviewMode::ExternalOpen),
            (MaterialType::Unknown, PreviewMode::Unsupported),
        ];
        for (material, expected) in cases {
            assert_eq!(material.preview_mode(), expected, "material: {material:?}");
        }
    }
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user