{"id":"mineru-tianshu","name":"Tianshu (天枢)","homepage":"https://github.com/magicyuan876/mineru-tianshu","repo_url":"https://github.com/magicyuan876/mineru-tianshu","category":"data-processing","subcategories":["document-parsing","ocr","rag-tooling"],"tags":["pdf","ocr","markdown","rag","document-processing","mcp","enterprise","multimodal"],"what_it_does":"Tianshu is an enterprise AI data preprocessing platform that converts unstructured documents (PDF, Word, Excel, PPT), images, audio, and video into AI-ready Markdown/JSON formats using MinerU and PaddleOCR-VL engines. It exposes document parsing capabilities via an MCP server for integration with AI assistants.","use_cases":["Preparing large document corpora for RAG pipelines","Enterprise document digitization and OCR at scale (109+ languages)","Integrating document parsing into AI assistant workflows via MCP","Bioinformatics data extraction from FASTA and GenBank files","Audio/video transcription with speaker identification for knowledge bases"],"not_for":["Simple single-file PDF text extraction (overkill)","Teams without Docker/GPU infrastructure","Real-time sub-second document processing requirements"],"best_when":"When you need enterprise-grade, multi-format document ingestion with GPU acceleration and role-based access control for RAG or data pipeline work.","avoid_when":"When you need a lightweight, cloud-hosted solution with no self-hosting overhead or when you only process a handful of documents occasionally.","alternatives":["docling","unstructured-io","llamaparse","marker"],"af_score":60.2,"security_score":65.0,"reliability_score":null,"package_type":"mcp_server","discovery_source":["github"],"priority":"low","status":"evaluated","version_evaluated":"latest","last_evaluated":"2026-03-01T09:50:05.948207+00:00","performance":{"latency_p50_ms":null,"latency_p99_ms":null,"uptime_sla_percent":null,"rate_limits":null,"data_source":"llm_estimated","measured_on":null}}