'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 09:55:30
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-31 07:07:31
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
  Please be aware of the BSL license restrictions before installing Bytedesk IM –
 selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
 仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
 Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
 contact: 270580156@qq.com
 技术/商务联系:270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
import logging
import uuid
from typing import List

from fastapi import APIRouter
from app.redisVector import myredisVector
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    UnstructuredWordDocumentLoader,
    CSVLoader,
    UnstructuredMarkdownLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredImageLoader,
    UnstructuredExcelLoader,
    UnstructuredXMLLoader,
    UnstructuredRTFLoader
)
# from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_unstructured import UnstructuredLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
from app.textsplitter.chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
from langchain.docstore.document import Document

# https://python.langchain.com/v0.2/docs/integrations/document_loaders/pypdfloader/
# https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/

# Maps file-name suffixes to the loader class used for them. Suffixes are
# written in lower case and compared against the lower-cased path, so
# "report.PDF" is routed to the same loader as "report.pdf".
_LOADERS_BY_SUFFIX = (
    ((".pdf",), PyPDFLoader),
    ((".txt",), TextLoader),
    ((".doc", ".docx"), UnstructuredWordDocumentLoader),
    ((".md",), UnstructuredMarkdownLoader),
    ((".html",), UnstructuredHTMLLoader),
    ((".png", ".jpg", ".jpeg"), UnstructuredImageLoader),
    ((".xlsx",), UnstructuredExcelLoader),
    ((".csv",), CSVLoader),
    ((".xml",), UnstructuredXMLLoader),
    ((".rtf",), UnstructuredRTFLoader),
    ((".epub",), UnstructuredEPubLoader),
)


def load_files(filePath: str) -> List[Document]:
    """Load the file at ``filePath`` with a loader chosen by its extension.

    The extension is matched case-insensitively against the suffix table
    above. Paths whose suffix is not listed there are handed to
    ``UnstructuredLoader``.

    Args:
        filePath: Path of the file to load; passed unchanged to the loader.

    Returns:
        The result of calling ``load()`` on the selected loader.
    """
    lowered_path = filePath.lower()
    loader_cls = UnstructuredLoader  # fallback for unlisted extensions
    for suffixes, candidate in _LOADERS_BY_SUFFIX:
        # str.endswith accepts a tuple, so one call covers all aliases.
        if lowered_path.endswith(suffixes):
            loader_cls = candidate
            break
    loader = loader_cls(filePath)
    return loader.load()


def split_docs(docs: List[Document]) -> List[Document]:
    """Split documents into chunks with ``ChineseRecursiveTextSplitter``.

    The splitter is configured with a chunk size of 500 and an overlap of 50,
    both measured with ``len``; separators are not treated as regular
    expressions.

    Args:
        docs: Documents to split.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = ChineseRecursiveTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(docs)


def load_and_parse(fileUid: str, filePath: str, kbUid: str) -> List[str]:
    """Load a file, split it into chunks, tag the chunks and store them.

    The original comment describes this as a callback for handling received
    messages; the caller is not visible in this file.

    Each chunk's metadata receives a freshly generated ``uid`` plus the given
    ``file_uid`` and ``kb_uid`` before the chunks are passed to
    ``myredisVector.add_docs``.

    Args:
        fileUid: Identifier stored under ``file_uid`` in every chunk's metadata.
        filePath: Path of the file to load.
        kbUid: Identifier stored under ``kb_uid`` in every chunk's metadata.

    Returns:
        Whatever ``myredisVector.add_docs`` returns for the stored chunks.
    """
    logging.info("load_and_parse: %s", filePath)

    # Parse the file into documents.
    docs = load_files(filePath)
    logging.info("Loaded %d documents", len(docs))

    # Split into chunks and attach identifying metadata to each one.
    chunks = split_docs(docs)
    for chunk in chunks:
        chunk.metadata["uid"] = uuid.uuid4().hex  # .hex is already a str
        chunk.metadata["file_uid"] = fileUid
        chunk.metadata["kb_uid"] = kbUid
    logging.info("Split into %d chunks", len(chunks))

    # Store the chunks in redis.
    docIds = myredisVector.add_docs(chunks)
    logging.info("Stored in redis")
    return docIds


router = APIRouter(
    prefix='/docs',
    tags=['docs v1 apis']
)


@router.get("/test")
def test():
    """Return a fixed payload for ``GET /docs/test``."""
    return {"docs": "test"}