use comrak::nodes::{AstNode, NodeValue}; use comrak::{Arena, ComrakOptions}; use uuid::Uuid; use crate::blocks::DocumentBlock; use crate::error::DocumentError; fn block_id() -> String { Uuid::new_v4().to_string() } fn gfm_options() -> ComrakOptions<'static> { let mut opts = ComrakOptions::default(); opts.extension.table = true; opts.extension.strikethrough = true; opts.extension.tagfilter = true; opts.extension.tasklist = true; opts } /// Parse a Markdown string into a list of DocumentBlock. pub fn parse_markdown(md_content: &str) -> Result, DocumentError> { let arena = Arena::new(); let options = gfm_options(); let root = comrak::parse_document(&arena, md_content, &options); let mut blocks = Vec::new(); collect_blocks(root, &mut blocks); Ok(blocks) } fn collect_blocks<'a>(node: &'a AstNode<'a>, blocks: &mut Vec) { for child in node.children() { let data = child.data.borrow(); let value = &data.value; let block = match value { NodeValue::Heading(heading) => Some(DocumentBlock::Heading { id: block_id(), level: heading.level, text: collect_text(child), }), NodeValue::Paragraph => { // Check for inline image — extract as standalone Image block if let Some(image_block) = extract_image_from_paragraph(child) { Some(image_block) } else { let text = collect_text(child); if text.is_empty() { None } else { Some(DocumentBlock::Paragraph { id: block_id(), text, }) } } } NodeValue::List(list) => { let items: Vec = child .children() .filter_map(|item| { if matches!(item.data.borrow().value, NodeValue::Item(_)) { Some(collect_text(item)) } else { None } }) .collect(); if items.is_empty() { None } else { Some(DocumentBlock::List { id: block_id(), ordered: list.list_type == comrak::nodes::ListType::Ordered, items, }) } } NodeValue::CodeBlock(code) => Some(DocumentBlock::CodeBlock { id: block_id(), language: if code.info.is_empty() { None } else { Some(code.info.clone()) }, code: code.literal.clone(), }), NodeValue::BlockQuote => { let text = collect_text(child); if text.is_empty() { None } else { Some(DocumentBlock::Quote { id: block_id(), text, }) } } NodeValue::Table(_) => { let mut headers = Vec::new(); let mut rows = Vec::new(); for row_node in child.children() { let cells: Vec = row_node .children() .map(|cell| collect_text(cell)) .collect(); if cells.is_empty() { continue; } // Skip separator rows like |---|---| if cells.iter().all(|c| c.chars().all(|ch| ch == '-' || ch == ':' || ch == ' ')) { continue; } if headers.is_empty() { headers = cells; } else { rows.push(cells); } } if headers.is_empty() { None } else { Some(DocumentBlock::Table { id: block_id(), headers, rows, }) } } NodeValue::Image(image) => { let alt_text = collect_text(child); Some(DocumentBlock::Image { id: block_id(), src: image.url.clone(), alt: if alt_text.is_empty() { None } else { Some(alt_text) }, }) } NodeValue::ThematicBreak => Some(DocumentBlock::HorizontalRule { id: block_id(), }), // Recurse into containers: Document, Item, TableCell, TableRow, etc. NodeValue::Document | NodeValue::Item(_) | NodeValue::TableCell | NodeValue::TableRow(_) | NodeValue::DescriptionList | NodeValue::DescriptionItem(_) | NodeValue::DescriptionTerm | NodeValue::DescriptionDetails => { collect_blocks(child, blocks); None } _ => None, }; if let Some(b) = block { blocks.push(b); } } } /// Check if a paragraph contains only an image, and extract it. fn extract_image_from_paragraph<'a>(node: &'a AstNode<'a>) -> Option { let mut image_node: Option<&AstNode> = None; let mut has_other_content = false; for child in node.children() { let data = child.data.borrow(); match &data.value { NodeValue::Image(_) => { image_node = Some(child); } NodeValue::Text(t) if t.trim().is_empty() => {} NodeValue::SoftBreak | NodeValue::LineBreak => {} _ => { has_other_content = true; } } } if let Some(img) = image_node { if !has_other_content { let data = img.data.borrow(); if let NodeValue::Image(image) = &data.value { let alt_text = collect_text(img); return Some(DocumentBlock::Image { id: block_id(), src: image.url.clone(), alt: if alt_text.is_empty() { None } else { Some(alt_text) }, }); } } } None } /// Extract plain text from all text nodes within a subtree. fn collect_text<'a>(node: &'a AstNode<'a>) -> String { let mut text = String::new(); collect_text_inner(node, &mut text); text.trim().to_string() } fn collect_text_inner<'a>(node: &'a AstNode<'a>, buf: &mut String) { for child in node.children() { let data = child.data.borrow(); match &data.value { NodeValue::Text(t) => buf.push_str(t), NodeValue::SoftBreak => buf.push(' '), NodeValue::LineBreak => buf.push('\n'), NodeValue::Code(code) => buf.push_str(&code.literal), _ => collect_text_inner(child, buf), } } } #[cfg(test)] mod tests { use super::*; #[test] fn test_empty_markdown() { let blocks = parse_markdown("").unwrap(); assert!(blocks.is_empty()); } #[test] fn test_headings() { let md = "# Title\n\n## Section\n\n### Sub"; let blocks = parse_markdown(md).unwrap(); let headings: Vec<_> = blocks .iter() .filter(|b| matches!(b, DocumentBlock::Heading { .. })) .collect(); assert_eq!(headings.len(), 3); } #[test] fn test_paragraph() { let md = "Hello world.\n\nThis is a paragraph."; let blocks = parse_markdown(md).unwrap(); let paras: Vec<_> = blocks .iter() .filter(|b| matches!(b, DocumentBlock::Paragraph { .. })) .collect(); assert_eq!(paras.len(), 2); } #[test] fn test_unordered_list() { let md = "- one\n- two\n- three"; let blocks = parse_markdown(md).unwrap(); if let Some(DocumentBlock::List { items, ordered, .. }) = blocks.first() { assert!(!ordered); assert_eq!(items.len(), 3); assert_eq!(items[0], "one"); } else { panic!("expected a list block"); } } #[test] fn test_ordered_list() { let md = "1. first\n2. second\n3. third"; let blocks = parse_markdown(md).unwrap(); if let Some(DocumentBlock::List { items, ordered, .. }) = blocks.first() { assert!(*ordered); assert_eq!(items.len(), 3); } else { panic!("expected an ordered list block"); } } #[test] fn test_code_block() { let md = "```rust\nfn main() {}\n```"; let blocks = parse_markdown(md).unwrap(); if let Some(DocumentBlock::CodeBlock { language, code, .. }) = blocks.first() { assert_eq!(language.as_deref(), Some("rust")); assert!(code.contains("fn main()")); } else { panic!("expected a code block"); } } #[test] fn test_blockquote() { let md = "> This is a quote"; let blocks = parse_markdown(md).unwrap(); if let Some(DocumentBlock::Quote { text, .. }) = blocks.first() { assert!(text.contains("This is a quote")); } else { panic!("expected a quote"); } } #[test] fn test_table() { // GFM table with separator row let md = "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |"; let blocks = parse_markdown(md).unwrap(); let tables: Vec<_> = blocks .iter() .filter(|b| matches!(b, DocumentBlock::Table { .. })) .collect(); assert!(!tables.is_empty(), "should have at least one table, got {:?}", blocks); } #[test] fn test_horizontal_rule() { let md = "---"; let blocks = parse_markdown(md).unwrap(); assert!(matches!( blocks.first(), Some(DocumentBlock::HorizontalRule { .. }) )); } #[test] fn test_image() { let md = "![alt text](https://example.com/img.png)"; let blocks = parse_markdown(md).unwrap(); if let Some(DocumentBlock::Image { src, alt, .. }) = blocks.first() { assert_eq!(src, "https://example.com/img.png"); assert_eq!(alt.as_deref(), Some("alt text")); } else { panic!("expected an image block"); } } #[test] fn test_complex_document() { let md = "# Heading\n\nParagraph text.\n\n- item 1\n- item 2\n\n```rs\nlet x = 1;\n```"; let blocks = parse_markdown(md).unwrap(); assert!(blocks.len() >= 4); assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::Heading { .. }))); assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::Paragraph { .. }))); assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::List { .. }))); assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::CodeBlock { .. }))); } }