Files
weiyu/modules/python/app/doc.py

112 lines
4.0 KiB
Python
Raw Normal View History

2024-12-14 10:43:18 +08:00
'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 09:55:30
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-31 07:07:31
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
#
import logging
import uuid
from typing import List
from fastapi import APIRouter
from app.redisVector import myredisVector
from langchain_community.document_loaders import (
PyPDFLoader,
TextLoader,
UnstructuredWordDocumentLoader,
CSVLoader,
UnstructuredMarkdownLoader,
UnstructuredEPubLoader,
UnstructuredHTMLLoader,
UnstructuredImageLoader,
UnstructuredExcelLoader,
UnstructuredXMLLoader,
UnstructuredRTFLoader
)
# from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_unstructured import UnstructuredLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
from app.textsplitter.chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
from langchain.docstore.document import Document
# https://python.langchain.com/v0.2/docs/integrations/document_loaders/pypdfloader/
# https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/
def load_files(filePath: str):
    """Load a file into LangChain documents with a loader chosen by extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is routed
    to the same loader as ``report.pdf``. Paths whose extension is not listed
    below fall back to the generic ``UnstructuredLoader``.

    Args:
        filePath: Path of the file to load. It is handed to the loader
            unchanged; only the comparison copy is lower-cased.

    Returns:
        The result of the selected loader's ``load()`` call (consumed by
        ``split_docs`` as a list of ``Document`` objects).
    """
    # (suffixes, loader class) pairs. The table is built inside the function
    # so the loader names are only resolved when the function is called.
    loaders_by_suffix = (
        ((".pdf",), PyPDFLoader),
        ((".txt",), TextLoader),
        ((".doc", ".docx"), UnstructuredWordDocumentLoader),
        ((".md",), UnstructuredMarkdownLoader),
        ((".html",), UnstructuredHTMLLoader),
        ((".png", ".jpg", ".jpeg"), UnstructuredImageLoader),
        ((".xlsx",), UnstructuredExcelLoader),
        ((".csv",), CSVLoader),
        ((".xml",), UnstructuredXMLLoader),
        ((".rtf",), UnstructuredRTFLoader),
        ((".epub",), UnstructuredEPubLoader),
    )

    # Compare against a lower-cased copy so upper/mixed-case extensions
    # (".PDF", ".Jpg") still reach their dedicated loader.
    lowered_path = filePath.lower()
    loader_cls = next(
        (cls for suffixes, cls in loaders_by_suffix if lowered_path.endswith(suffixes)),
        UnstructuredLoader,  # generic fallback for every other extension
    )

    loader = loader_cls(filePath)
    return loader.load()
def split_docs(
    docs: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """Split documents into chunks using ``ChineseRecursiveTextSplitter``.

    Args:
        docs: Documents to split.
        chunk_size: Chunk size passed to the splitter; lengths are measured
            with ``len``. Defaults to 500, the previously hard-coded value.
        chunk_overlap: Overlap passed to the splitter. Defaults to 50, the
            previously hard-coded value.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    # NOTE(review): is_separator_regex=False is kept from the original. If the
    # splitter's default separators are regex patterns this may be unintended
    # - confirm against app.textsplitter.chinese_recursive_text_splitter.
    text_splitter = ChineseRecursiveTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(docs)
# Ingestion entry point: parse a file, chunk it, tag the chunks and store them.
# NOTE(review): the original comment described this as a callback for handling
# received messages; the caller is not visible here - confirm.
def load_and_parse(fileUid: str, filePath: str, kbUid: str) -> List[str]:
    """Load ``filePath``, split it into chunks and store the chunks via ``myredisVector``.

    Each chunk's metadata gets a fresh ``uid`` (uuid4 hex) plus the given
    ``file_uid`` and ``kb_uid`` before it is stored.

    Args:
        fileUid: Identifier written to every chunk's ``file_uid`` metadata.
        filePath: Path of the file to load.
        kbUid: Identifier written to every chunk's ``kb_uid`` metadata.

    Returns:
        Whatever ``myredisVector.add_docs`` returns for the stored chunks.
    """
    logging.info(f"load_and_parse: {filePath}")

    # Parse the file into documents.
    loaded = load_files(filePath)
    logging.info(f"Loaded {len(loaded)} documents")

    # Chunk the documents and tag every chunk with its identifiers.
    chunks = split_docs(loaded)
    for chunk in chunks:
        chunk.metadata.update(
            {
                "uid": uuid.uuid4().hex,
                "file_uid": fileUid,
                "kb_uid": kbUid,
            }
        )
    logging.info(f"Split into {len(chunks)} chunks")

    # Store the tagged chunks in the Redis vector store.
    stored_ids = myredisVector.add_docs(chunks)
    logging.info("Stored in redis")
    return stored_ids
# Router for this module's HTTP endpoints; every route is served under "/docs".
router = APIRouter(prefix="/docs", tags=["docs v1 apis"])
# Smoke-test endpoint: GET <router prefix>/test returns a fixed payload.
# Documented with comments rather than a docstring so the generated OpenAPI
# description for this route stays unchanged.
@router.get("/test")
def test():
    payload = {"docs": "test"}
    return payload