From c390718c157bd99e69f8636cc65db0a67f1cb4bf Mon Sep 17 00:00:00 2001
From: wangdl <wangdl@longde.cloud>
Date: Sat, 30 May 2026 20:50:54 +0800
Subject: [PATCH] feat: Markdown parsing with comrak - GFM tables,
 strikethrough, tasklist, all 11 block types

---
 crates/zx_document_core/Cargo.toml      |   2 +
 crates/zx_document_core/src/markdown.rs | 352 ++++++++++++++++++++++++
 2 files changed, 354 insertions(+)
diff --git a/crates/zx_document_core/Cargo.toml b/crates/zx_document_core/Cargo.toml
index 70bb2a4..b3bc1cc 100644
--- a/crates/zx_document_core/Cargo.toml
+++ b/crates/zx_document_core/Cargo.toml
@@ -8,3 +8,5 @@ serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 infer = "0.16"
 mime_guess = "2"
+comrak = "0.29"
+uuid = { version = "1", features = ["v4"] }
diff --git a/crates/zx_document_core/src/markdown.rs b/crates/zx_document_core/src/markdown.rs
index e69de29..4af81a3 100644
--- a/crates/zx_document_core/src/markdown.rs
+++ b/crates/zx_document_core/src/markdown.rs
@@ -0,0 +1,352 @@
+use comrak::nodes::{AstNode, NodeValue};
+use comrak::{Arena, ComrakOptions};
+use uuid::Uuid;
+
+use crate::blocks::DocumentBlock;
+use crate::error::DocumentError;
+
+fn block_id() -> String {
+    Uuid::new_v4().hyphenated().to_string() // random (v4) UUID, lowercase hyphenated form
+}
+
+/// Comrak defaults plus the GFM extensions: tables, strikethrough, tagfilter, task lists.
+fn gfm_options() -> ComrakOptions<'static> {
+    let mut gfm = ComrakOptions::default();
+    let ext = &mut gfm.extension;
+    (ext.table, ext.strikethrough) = (true, true);
+    (ext.tagfilter, ext.tasklist) = (true, true);
+    gfm
+}
+
+/// Parse Markdown (with the GFM extensions enabled) into a flat `Vec<DocumentBlock>`.
+/// Always returns `Ok` at present; the body has no failure path.
+pub fn parse_markdown(md_content: &str) -> Result<Vec<DocumentBlock>, DocumentError> {
+    let node_arena = Arena::new();
+    let parse_opts = gfm_options();
+    let document = comrak::parse_document(&node_arena, md_content, &parse_opts);
+    let mut parsed = Vec::new();
+    collect_blocks(document, &mut parsed);
+    Ok(parsed)
+}
+
+/// Walk the direct children of `node` and append one `DocumentBlock` per
+/// recognised block-level node to `blocks`, in document order.
+///
+/// Task-list entries (`NodeValue::TaskItem`) count as list items, and every
+/// non-empty table row in the AST is kept: the first becomes `headers`, the
+/// rest become `rows`.
+fn collect_blocks<'a>(node: &'a AstNode<'a>, blocks: &mut Vec<DocumentBlock>) {
+    for child in node.children() {
+        let data = child.data.borrow();
+        let value = &data.value;
+
+        let block = match value {
+            NodeValue::Heading(heading) => Some(DocumentBlock::Heading {
+                id: block_id(),
+                level: heading.level,
+                text: collect_text(child),
+            }),
+            NodeValue::Paragraph => {
+                // A paragraph that holds only an image becomes a standalone Image block.
+                if let Some(image_block) = extract_image_from_paragraph(child) {
+                    Some(image_block)
+                } else {
+                    let text = collect_text(child);
+                    if text.is_empty() {
+                        None
+                    } else {
+                        Some(DocumentBlock::Paragraph {
+                            id: block_id(),
+                            text,
+                        })
+                    }
+                }
+            }
+            NodeValue::List(list) => {
+                // With the tasklist extension on, comrak tags `- [ ]` / `- [x]`
+                // entries as `TaskItem`; accept both kinds so task lists are kept.
+                let items: Vec<String> = child
+                    .children()
+                    .filter_map(|item| match item.data.borrow().value {
+                        NodeValue::Item(_) | NodeValue::TaskItem(_) => Some(collect_text(item)),
+                        _ => None,
+                    })
+                    .collect();
+                if items.is_empty() {
+                    None
+                } else {
+                    Some(DocumentBlock::List {
+                        id: block_id(),
+                        ordered: list.list_type == comrak::nodes::ListType::Ordered,
+                        items,
+                    })
+                }
+            }
+            NodeValue::CodeBlock(code) => Some(DocumentBlock::CodeBlock {
+                id: block_id(),
+                language: if code.info.is_empty() {
+                    None
+                } else {
+                    Some(code.info.clone())
+                },
+                code: code.literal.clone(),
+            }),
+            NodeValue::BlockQuote => {
+                let text = collect_text(child);
+                if text.is_empty() {
+                    None
+                } else {
+                    Some(DocumentBlock::Quote {
+                        id: block_id(),
+                        text,
+                    })
+                }
+            }
+            NodeValue::Table(_) => {
+                let mut headers = Vec::new();
+                let mut rows = Vec::new();
+                // The `|---|---|` delimiter line never reaches the AST (comrak
+                // turns it into column alignments), so no row is filtered by
+                // content: a row of empty or dash-only cells is real data.
+                for row_node in child.children() {
+                    let cells: Vec<String> = row_node.children().map(collect_text).collect();
+                    if cells.is_empty() {
+                        continue;
+                    }
+                    // The header row is always the table's first child.
+                    if headers.is_empty() {
+                        headers = cells;
+                    } else {
+                        rows.push(cells);
+                    }
+                }
+                if headers.is_empty() {
+                    None
+                } else {
+                    Some(DocumentBlock::Table {
+                        id: block_id(),
+                        headers,
+                        rows,
+                    })
+                }
+            }
+            NodeValue::Image(image) => {
+                let alt_text = collect_text(child);
+                Some(DocumentBlock::Image {
+                    id: block_id(),
+                    src: image.url.clone(),
+                    alt: if alt_text.is_empty() {
+                        None
+                    } else {
+                        Some(alt_text)
+                    },
+                })
+            }
+            NodeValue::ThematicBreak => Some(DocumentBlock::HorizontalRule {
+                id: block_id(),
+            }),
+            // Recurse into containers: Document, Item, TableCell, TableRow, etc.
+            NodeValue::Document
+            | NodeValue::Item(_)
+            | NodeValue::TableCell
+            | NodeValue::TableRow(_)
+            | NodeValue::DescriptionList
+            | NodeValue::DescriptionItem(_)
+            | NodeValue::DescriptionTerm
+            | NodeValue::DescriptionDetails => {
+                collect_blocks(child, blocks);
+                None
+            }
+            _ => None,
+        };
+
+        if let Some(b) = block {
+            blocks.push(b);
+        }
+    }
+}
+
+/// Turn a paragraph that holds an image and nothing else into an `Image` block.
+///
+/// Whitespace-only text and soft/hard line breaks next to the image are
+/// tolerated; any other inline node makes the function return `None`, and so
+/// does a paragraph without an image. If several images qualify, the last
+/// one wins. `alt` is `None` when the image has no alt text.
+fn extract_image_from_paragraph<'a>(node: &'a AstNode<'a>) -> Option<DocumentBlock> {
+    let mut last_image: Option<&'a AstNode<'a>> = None;
+    for inline in node.children() {
+        let inline_data = inline.data.borrow();
+        match &inline_data.value {
+            NodeValue::Image(_) => last_image = Some(inline),
+            NodeValue::SoftBreak | NodeValue::LineBreak => {}
+            NodeValue::Text(t) if t.trim().is_empty() => {}
+            // Real content next to the image: treat it as an ordinary paragraph.
+            _ => return None,
+        }
+    }
+
+    let img = last_image?;
+    let img_data = img.data.borrow();
+    match &img_data.value {
+        NodeValue::Image(link) => {
+            let alt_text = collect_text(img);
+            Some(DocumentBlock::Image {
+                id: block_id(),
+                src: link.url.clone(),
+                alt: if alt_text.is_empty() {
+                    None
+                } else {
+                    Some(alt_text)
+                },
+            })
+        }
+        // Not reachable: `last_image` is only ever set from an `Image` node.
+        _ => None,
+    }
+}
+
+/// Gather the text content beneath `node` into one string, trimmed at both ends.
+fn collect_text<'a>(node: &'a AstNode<'a>) -> String {
+    let mut gathered = String::new();
+    collect_text_inner(node, &mut gathered);
+    String::from(gathered.trim())
+}
+
+/// Depth-first append of the text under `node`: soft break -> ' ', hard break -> '\n'.
+fn collect_text_inner<'a>(node: &'a AstNode<'a>, buf: &mut String) {
+    for descendant in node.children() {
+        match &descendant.data.borrow().value {
+            NodeValue::Code(span) => buf.push_str(&span.literal),
+            NodeValue::Text(literal) => buf.push_str(literal),
+            NodeValue::LineBreak => buf.push('\n'),
+            NodeValue::SoftBreak => buf.push(' '),
+            _ => collect_text_inner(descendant, buf),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_empty_markdown() {
+        let blocks = parse_markdown("").unwrap();
+        assert!(blocks.is_empty());
+    }
+
+    #[test]
+    fn test_headings() {
+        let md = "# Title\n\n## Section\n\n### Sub";
+        let blocks = parse_markdown(md).unwrap();
+        let headings: Vec<_> = blocks
+            .iter()
+            .filter(|b| matches!(b, DocumentBlock::Heading { .. }))
+            .collect();
+        assert_eq!(headings.len(), 3);
+    }
+
+    #[test]
+    fn test_paragraph() {
+        let md = "Hello world.\n\nThis is a paragraph.";
+        let blocks = parse_markdown(md).unwrap();
+        let paras: Vec<_> = blocks
+            .iter()
+            .filter(|b| matches!(b, DocumentBlock::Paragraph { .. }))
+            .collect();
+        assert_eq!(paras.len(), 2);
+    }
+
+    #[test]
+    fn test_unordered_list() {
+        let md = "- one\n- two\n- three";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::List { items, ordered, .. }) = blocks.first() {
+            assert!(!ordered);
+            assert_eq!(items.len(), 3);
+            assert_eq!(items[0], "one");
+        } else {
+            panic!("expected a list block");
+        }
+    }
+
+    #[test]
+    fn test_ordered_list() {
+        let md = "1. first\n2. second\n3. third";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::List { items, ordered, .. }) = blocks.first() {
+            assert!(*ordered);
+            assert_eq!(items.len(), 3);
+        } else {
+            panic!("expected an ordered list block");
+        }
+    }
+
+    #[test]
+    fn test_code_block() {
+        let md = "```rust\nfn main() {}\n```";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::CodeBlock { language, code, .. }) = blocks.first() {
+            assert_eq!(language.as_deref(), Some("rust"));
+            assert!(code.contains("fn main()"));
+        } else {
+            panic!("expected a code block");
+        }
+    }
+
+    #[test]
+    fn test_blockquote() {
+        let md = "> This is a quote";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::Quote { text, .. }) = blocks.first() {
+            assert!(text.contains("This is a quote"));
+        } else {
+            panic!("expected a quote");
+        }
+    }
+
+    #[test]
+    fn test_table() {
+        let md = "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::Table { headers, rows, .. }) = blocks.first() {
+            assert_eq!(headers, &["A", "B"]);
+            assert_eq!(rows, &[["1", "2"], ["3", "4"]]);
+        } else {
+            panic!("expected a table block, got {:?}", blocks);
+        }
+    }
+
+    #[test]
+    fn test_horizontal_rule() {
+        let md = "---";
+        let blocks = parse_markdown(md).unwrap();
+        assert!(matches!(
+            blocks.first(),
+            Some(DocumentBlock::HorizontalRule { .. })
+        ));
+    }
+
+    #[test]
+    fn test_image() {
+        let md = "![alt text](https://example.com/img.png)";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::Image { src, alt, .. }) = blocks.first() {
+            assert_eq!(src, "https://example.com/img.png");
+            assert_eq!(alt.as_deref(), Some("alt text"));
+        } else {
+            panic!("expected an image block");
+        }
+    }
+
+    #[test]
+    fn test_complex_document() {
+        let md = "# Heading\n\nParagraph text.\n\n- item 1\n- item 2\n\n```rs\nlet x = 1;\n```";
+        let blocks = parse_markdown(md).unwrap();
+        assert!(blocks.len() >= 4);
+        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::Heading { .. })));
+        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::Paragraph { .. })));
+        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::List { .. })));
+        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::CodeBlock { .. })));
+    }
+}