353 lines
11 KiB
Rust

use comrak::nodes::{AstNode, NodeValue};
use comrak::{Arena, ComrakOptions};
use uuid::Uuid;
use crate::blocks::DocumentBlock;
use crate::error::DocumentError;
/// Produce a fresh identifier for a document block.
///
/// The identifier is the string form of a newly generated version-4 UUID.
fn block_id() -> String {
    let id = Uuid::new_v4();
    id.to_string()
}
/// Build the comrak options used for parsing: the defaults plus the
/// table, strikethrough, tagfilter and tasklist extensions.
fn gfm_options() -> ComrakOptions<'static> {
    let mut options = ComrakOptions::default();

    // Each extension is switched on independently; the order is irrelevant.
    options.extension.tasklist = true;
    options.extension.tagfilter = true;
    options.extension.strikethrough = true;
    options.extension.table = true;

    options
}
/// Parse a Markdown string into an ordered list of [`DocumentBlock`]s.
///
/// The input is parsed with the options from `gfm_options`, and the resulting
/// tree is converted by `collect_blocks` starting at the document root.
///
/// # Errors
///
/// The current implementation always returns `Ok`; the `Result` wrapper is
/// part of the signature only.
pub fn parse_markdown(md_content: &str) -> Result<Vec<DocumentBlock>, DocumentError> {
    let options = gfm_options();
    // The arena owns every AST node, so it must outlive the tree walk below.
    let arena = Arena::new();
    let document = comrak::parse_document(&arena, md_content, &options);

    let mut collected = Vec::new();
    collect_blocks(document, &mut collected);
    Ok(collected)
}
fn collect_blocks<'a>(node: &'a AstNode<'a>, blocks: &mut Vec<DocumentBlock>) {
for child in node.children() {
let data = child.data.borrow();
let value = &data.value;
let block = match value {
NodeValue::Heading(heading) => Some(DocumentBlock::Heading {
id: block_id(),
level: heading.level,
text: collect_text(child),
}),
NodeValue::Paragraph => {
// Check for inline image — extract as standalone Image block
if let Some(image_block) = extract_image_from_paragraph(child) {
Some(image_block)
} else {
let text = collect_text(child);
if text.is_empty() {
None
} else {
Some(DocumentBlock::Paragraph {
id: block_id(),
text,
})
}
}
}
NodeValue::List(list) => {
let items: Vec<String> = child
.children()
.filter_map(|item| {
if matches!(item.data.borrow().value, NodeValue::Item(_)) {
Some(collect_text(item))
} else {
None
}
})
.collect();
if items.is_empty() {
None
} else {
Some(DocumentBlock::List {
id: block_id(),
ordered: list.list_type == comrak::nodes::ListType::Ordered,
items,
})
}
}
NodeValue::CodeBlock(code) => Some(DocumentBlock::CodeBlock {
id: block_id(),
language: if code.info.is_empty() {
None
} else {
Some(code.info.clone())
},
code: code.literal.clone(),
}),
NodeValue::BlockQuote => {
let text = collect_text(child);
if text.is_empty() {
None
} else {
Some(DocumentBlock::Quote {
id: block_id(),
text,
})
}
}
NodeValue::Table(_) => {
let mut headers = Vec::new();
let mut rows = Vec::new();
for row_node in child.children() {
let cells: Vec<String> = row_node
.children()
.map(|cell| collect_text(cell))
.collect();
if cells.is_empty() {
continue;
}
// Skip separator rows like |---|---|
if cells.iter().all(|c| c.chars().all(|ch| ch == '-' || ch == ':' || ch == ' ')) {
continue;
}
if headers.is_empty() {
headers = cells;
} else {
rows.push(cells);
}
}
if headers.is_empty() {
None
} else {
Some(DocumentBlock::Table {
id: block_id(),
headers,
rows,
})
}
}
NodeValue::Image(image) => {
let alt_text = collect_text(child);
Some(DocumentBlock::Image {
id: block_id(),
src: image.url.clone(),
alt: if alt_text.is_empty() {
None
} else {
Some(alt_text)
},
})
}
NodeValue::ThematicBreak => Some(DocumentBlock::HorizontalRule {
id: block_id(),
}),
// Recurse into containers: Document, Item, TableCell, TableRow, etc.
NodeValue::Document
| NodeValue::Item(_)
| NodeValue::TableCell
| NodeValue::TableRow(_)
| NodeValue::DescriptionList
| NodeValue::DescriptionItem(_)
| NodeValue::DescriptionTerm
| NodeValue::DescriptionDetails => {
collect_blocks(child, blocks);
None
}
_ => None,
};
if let Some(b) = block {
blocks.push(b);
}
}
}
/// Turn a paragraph that holds nothing but an image into an `Image` block.
///
/// Whitespace-only text and soft or hard line breaks next to the image are
/// ignored. Any other inline child makes this an ordinary paragraph and the
/// result is `None`, as it is when the paragraph contains no image at all.
/// If the paragraph holds several images and nothing else, the last one is
/// the one returned.
fn extract_image_from_paragraph<'a>(node: &'a AstNode<'a>) -> Option<DocumentBlock> {
    let mut last_image: Option<&'a AstNode<'a>> = None;
    for inline in node.children() {
        let inline_data = inline.data.borrow();
        match &inline_data.value {
            NodeValue::Image(_) => last_image = Some(inline),
            NodeValue::SoftBreak | NodeValue::LineBreak => {}
            NodeValue::Text(t) if t.trim().is_empty() => {}
            // Real content alongside the image: not a standalone image.
            _ => return None,
        }
    }

    let img = last_image?;
    let img_data = img.data.borrow();
    match &img_data.value {
        NodeValue::Image(image) => {
            // The alt text lives in the image node's own inline children.
            let alt = Some(collect_text(img)).filter(|alt_text| !alt_text.is_empty());
            Some(DocumentBlock::Image {
                id: block_id(),
                src: image.url.clone(),
                alt,
            })
        }
        _ => None,
    }
}
/// Gather the plain text of every descendant of `node` into one string,
/// with leading and trailing whitespace removed.
fn collect_text<'a>(node: &'a AstNode<'a>) -> String {
    let mut buf = String::new();
    collect_text_inner(node, &mut buf);
    buf.trim().to_owned()
}
/// Append the plain text of every descendant of `node` to `buf`, depth first.
///
/// Text and inline-code literals are copied verbatim, a soft break becomes a
/// space and a hard line break becomes a newline. All other nodes contribute
/// only through their children.
///
/// Block-level descendants (for example the paragraphs of a block quote, or a
/// nested list inside a list item) are separated by a newline. Without it the
/// last word of one block would be fused to the first word of the next.
fn collect_text_inner<'a>(node: &'a AstNode<'a>, buf: &mut String) {
    for child in node.children() {
        let data = child.data.borrow();
        // Only separate from text that is already there, and never stack a
        // separator on top of existing whitespace.
        if data.value.block() && !buf.is_empty() && !buf.ends_with(char::is_whitespace) {
            buf.push('\n');
        }
        match &data.value {
            NodeValue::Text(t) => buf.push_str(t),
            NodeValue::SoftBreak => buf.push(' '),
            NodeValue::LineBreak => buf.push('\n'),
            NodeValue::Code(code) => buf.push_str(&code.literal),
            _ => collect_text_inner(child, buf),
        }
    }
}
/// Unit tests for `parse_markdown`, one per supported block kind plus a
/// mixed-document smoke test.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input produces no blocks.
    #[test]
    fn test_empty_markdown() {
        let blocks = parse_markdown("").unwrap();
        assert!(blocks.is_empty());
    }

    /// Each ATX heading becomes its own Heading block.
    #[test]
    fn test_headings() {
        let md = "# Title\n\n## Section\n\n### Sub";
        let blocks = parse_markdown(md).unwrap();
        let headings: Vec<_> = blocks
            .iter()
            .filter(|b| matches!(b, DocumentBlock::Heading { .. }))
            .collect();
        assert_eq!(headings.len(), 3);
    }

    /// Blank-line-separated text yields one Paragraph block per paragraph.
    #[test]
    fn test_paragraph() {
        let md = "Hello world.\n\nThis is a paragraph.";
        let blocks = parse_markdown(md).unwrap();
        let paras: Vec<_> = blocks
            .iter()
            .filter(|b| matches!(b, DocumentBlock::Paragraph { .. }))
            .collect();
        assert_eq!(paras.len(), 2);
    }

    /// A bullet list becomes a single unordered List block.
    #[test]
    fn test_unordered_list() {
        let md = "- one\n- two\n- three";
        let blocks = parse_markdown(md).unwrap();
        if let Some(DocumentBlock::List { items, ordered, .. }) = blocks.first() {
            assert!(!ordered);
            assert_eq!(items.len(), 3);
            assert_eq!(items[0], "one");
        } else {
            panic!("expected a list block");
        }
    }

    /// A numbered list becomes a single ordered List block.
    #[test]
    fn test_ordered_list() {
        let md = "1. first\n2. second\n3. third";
        let blocks = parse_markdown(md).unwrap();
        if let Some(DocumentBlock::List { items, ordered, .. }) = blocks.first() {
            assert!(*ordered);
            assert_eq!(items.len(), 3);
        } else {
            panic!("expected an ordered list block");
        }
    }

    /// A fenced code block keeps its language tag and its contents.
    #[test]
    fn test_code_block() {
        let md = "```rust\nfn main() {}\n```";
        let blocks = parse_markdown(md).unwrap();
        if let Some(DocumentBlock::CodeBlock { language, code, .. }) = blocks.first() {
            assert_eq!(language.as_deref(), Some("rust"));
            assert!(code.contains("fn main()"));
        } else {
            panic!("expected a code block");
        }
    }

    /// A block quote becomes a Quote block carrying the quoted text.
    #[test]
    fn test_blockquote() {
        let md = "> This is a quote";
        let blocks = parse_markdown(md).unwrap();
        if let Some(DocumentBlock::Quote { text, .. }) = blocks.first() {
            assert!(text.contains("This is a quote"));
        } else {
            panic!("expected a quote");
        }
    }

    /// A GFM table yields its header cells and data rows; the delimiter row
    /// must not leak into either.
    #[test]
    fn test_table() {
        // GFM table with separator row
        let md = "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |";
        let blocks = parse_markdown(md).unwrap();
        let table = blocks
            .iter()
            .find(|b| matches!(b, DocumentBlock::Table { .. }));
        if let Some(DocumentBlock::Table { headers, rows, .. }) = table {
            // Pin the actual contents, not just the presence of a table, so a
            // regression in header/row classification is caught.
            assert_eq!(*headers, vec!["A", "B"]);
            assert_eq!(*rows, vec![vec!["1", "2"], vec!["3", "4"]]);
        } else {
            panic!("should have at least one table, got {:?}", blocks);
        }
    }

    /// A thematic break becomes a HorizontalRule block.
    #[test]
    fn test_horizontal_rule() {
        let md = "---";
        let blocks = parse_markdown(md).unwrap();
        assert!(matches!(
            blocks.first(),
            Some(DocumentBlock::HorizontalRule { .. })
        ));
    }

    /// A paragraph consisting solely of an image becomes an Image block.
    #[test]
    fn test_image() {
        let md = "![alt text](https://example.com/img.png)";
        let blocks = parse_markdown(md).unwrap();
        if let Some(DocumentBlock::Image { src, alt, .. }) = blocks.first() {
            assert_eq!(src, "https://example.com/img.png");
            assert_eq!(alt.as_deref(), Some("alt text"));
        } else {
            panic!("expected an image block");
        }
    }

    /// A document mixing several block kinds yields one block of each kind.
    #[test]
    fn test_complex_document() {
        let md = "# Heading\n\nParagraph text.\n\n- item 1\n- item 2\n\n```rs\nlet x = 1;\n```";
        let blocks = parse_markdown(md).unwrap();
        assert!(blocks.len() >= 4);
        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::Heading { .. })));
        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::Paragraph { .. })));
        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::List { .. })));
        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::CodeBlock { .. })));
    }
}