mirror of
https://gitee.com/270580156/weiyu.git
synced 2026-05-14 19:27:53 +00:00
112 lines
4.0 KiB
Python
112 lines
4.0 KiB
Python
'''
|
||
Author: jackning 270580156@qq.com
|
||
Date: 2024-08-29 09:55:30
|
||
LastEditors: jackning 270580156@qq.com
|
||
LastEditTime: 2024-08-31 07:07:31
|
||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||
contact: 270580156@qq.com
|
||
技术/商务联系:270580156@qq.com
|
||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||
'''
|
||
#
|
||
import logging
|
||
import uuid
|
||
from typing import List
|
||
from fastapi import APIRouter
|
||
from app.redisVector import myredisVector
|
||
from langchain_community.document_loaders import (
|
||
PyPDFLoader,
|
||
TextLoader,
|
||
UnstructuredWordDocumentLoader,
|
||
CSVLoader,
|
||
UnstructuredMarkdownLoader,
|
||
UnstructuredEPubLoader,
|
||
UnstructuredHTMLLoader,
|
||
UnstructuredImageLoader,
|
||
UnstructuredExcelLoader,
|
||
UnstructuredXMLLoader,
|
||
UnstructuredRTFLoader
|
||
)
|
||
# from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||
from langchain_unstructured import UnstructuredLoader
|
||
# from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||
from app.textsplitter.chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
|
||
from langchain.docstore.document import Document
|
||
|
||
# https://python.langchain.com/v0.2/docs/integrations/document_loaders/pypdfloader/
|
||
# https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/
|
||
def load_files(filePath: str):
    """Load a file with a document loader chosen from its extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is routed
    to the same loader as ``report.pdf``. Paths whose extension is not in the
    table below fall back to ``UnstructuredLoader``.

    Args:
        filePath: Path of the file to load. It is passed to the loader
            unchanged; only the comparison uses a lower-cased copy.

    Returns:
        Whatever the selected loader's ``load()`` returns (callers in this
        module treat it as a list of documents).
    """
    # (suffixes, loader class) pairs, checked in order; the first match wins.
    # Kept in the same order as the original if/elif chain.
    loaders_by_suffix = (
        ((".pdf",), PyPDFLoader),
        ((".txt",), TextLoader),
        ((".doc", ".docx"), UnstructuredWordDocumentLoader),
        ((".md",), UnstructuredMarkdownLoader),
        ((".html",), UnstructuredHTMLLoader),
        ((".png", ".jpg", ".jpeg"), UnstructuredImageLoader),
        ((".xlsx",), UnstructuredExcelLoader),
        ((".csv",), CSVLoader),
        ((".xml",), UnstructuredXMLLoader),
        ((".rtf",), UnstructuredRTFLoader),
        ((".epub",), UnstructuredEPubLoader),
    )

    # Lower-case only for matching: upper-case extensions are common on files
    # coming from Windows or cameras, and previously fell through to the
    # generic loader.
    lowered_path = filePath.lower()
    loader_cls = next(
        (
            cls
            for suffixes, cls in loaders_by_suffix
            if lowered_path.endswith(suffixes)
        ),
        UnstructuredLoader,
    )

    loader = loader_cls(filePath)
    return loader.load()
|
||
|
||
def split_docs(
    docs: List[Document],
    *,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """Split documents into chunks with ``ChineseRecursiveTextSplitter``.

    Args:
        docs: Documents to split.
        chunk_size: Value passed to the splitter as ``chunk_size``; length is
            measured with ``len``. Defaults to 500, the value that used to be
            hard-coded here.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 50, the value that used to be hard-coded here.

    Returns:
        The result of the splitter's ``split_documents(docs)``.
    """
    text_splitter = ChineseRecursiveTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(docs)
|
||
|
||
# 定义一个回调函数来处理接收到的消息
|
||
|
||
|
||
def load_and_parse(fileUid: str, filePath: str, kbUid: str) -> List[str]:
    """Load a file, split it into chunks, tag each chunk and store them.

    Args:
        fileUid: Identifier written to every chunk's ``file_uid`` metadata.
        filePath: Path of the file to load; passed to ``load_files``.
        kbUid: Identifier written to every chunk's ``kb_uid`` metadata.

    Returns:
        The value returned by ``myredisVector.add_docs`` for the chunks
        (annotated as a list of string ids).
    """
    logging.info(f"load_and_parse: {filePath}")

    # Parse the file into documents.
    loaded_docs = load_files(filePath)
    logging.info(f"Loaded {len(loaded_docs)} documents")

    # Split into chunks and stamp each chunk with a fresh uid plus the ids of
    # the file and knowledge base it belongs to.
    chunks = split_docs(loaded_docs)
    for chunk in chunks:
        chunk.metadata.update(
            {
                "uid": uuid.uuid4().hex,
                "file_uid": fileUid,
                "kb_uid": kbUid,
            }
        )
    logging.info(f"Split into {len(chunks)} chunks")

    # Store the chunks through the Redis vector wrapper.
    stored_ids = myredisVector.add_docs(chunks)
    logging.info(f"Stored in redis")
    return stored_ids
|
||
|
||
|
||
# Router for the document endpoints: routes registered on it are mounted
# under the "/docs" prefix and carry the "docs v1 apis" tag.
router = APIRouter(prefix='/docs', tags=['docs v1 apis'])
|
||
|
||
@router.get("/test")
def test():
    """Return a fixed payload; a simple check that the router is reachable."""
    payload = {"docs": "test"}
    return payload
|