Sync from bytedesk-private: update

2026-05-14 19:27:53 +00:00 · 2024-12-14 10:43:18 +08:00
parent 476eebb101
commit 5e082909e4
3421 changed files with 812709 additions and 0 deletions
--- a/modules/python/app/init.py
+++ b/modules/python/app/init.py
--- a/modules/python/app/api.py
+++ b/modules/python/app/api.py
@@ -0,0 +1,24 @@
+'''
+Author: jackning 270580156@qq.com
+Date: 2024-08-29 10:35:52
+LastEditors: jackning 270580156@qq.com
+LastEditTime: 2024-08-29 10:38:28
+Description: bytedesk.com https://github.com/Bytedesk/bytedesk
+  Please be aware of the BSL license restrictions before installing Bytedesk IM – 
+ selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license. 
+ 仅支持企业内部员工自用，严禁私自用于销售、二次销售或者部署SaaS方式销售 
+ Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE 
+ contact: 270580156@qq.com 
+ 技术/商务联系：270580156@qq.com
+Copyright (c) 2024 by bytedesk.com, All Rights Reserved. 
+'''
+# 
+from fastapi import APIRouter
+
+from app import chat, config, doc, tts
+#
+# Aggregate the chat, settings, docs and tts routers into a single v1 router.
+# NOTE(review): where this router is mounted (presumably under API_V1_PREFIX) is not visible here.
+api_v1_router = APIRouter()
+api_v1_router.include_router(chat.router)
+api_v1_router.include_router(config.router)
+api_v1_router.include_router(doc.router)
+api_v1_router.include_router(tts.router)
--- a/modules/python/app/chat.py
+++ b/modules/python/app/chat.py
@@ -0,0 +1,47 @@
+'''
+Author: jackning 270580156@qq.com
+Date: 2024-08-29 09:55:35
+LastEditors: jackning 270580156@qq.com
+LastEditTime: 2024-08-31 19:19:57
+Description: bytedesk.com https://github.com/Bytedesk/bytedesk
+  Please be aware of the BSL license restrictions before installing Bytedesk IM – 
+ selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license. 
+ 仅支持企业内部员工自用，严禁私自用于销售、二次销售或者部署SaaS方式销售 
+ Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE 
+ contact: 270580156@qq.com 
+ 技术/商务联系：270580156@qq.com
+Copyright (c) 2024 by bytedesk.com, All Rights Reserved. 
+'''
+# 
+import logging
+from fastapi import APIRouter, Request
+from app.redisVector import myredisVector
+
+# Router for the chat endpoints; every route registered on it gets the /chat prefix.
+router = APIRouter(
+    prefix='/chat',
+    tags=['chat v1 apis']
+)
+
+# http://127.0.0.1:9007/api/v1/chat/query?kbuid=1461090177253570&query=报名条件
+# http://127.0.0.1:9007/api/v1/chat/query?kbuid=1461487033909519&query=DataStructure
+@router.get("/query")
+def query(kbuid: str, query: str):
+    # 测试搜索结果
+    search_results = myredisVector.search_docs(kbUid=kbuid, query=query)
+    # search_results = myredisVector.search_as_retriever(kbUid=kbuid, query=query)
+    return {
+        "results": search_results
+    }
+
+# # http://127.0.0.1:9007/api/v1/chat/stream?kbuid=1461090177253570&query=报名条件
+# # http://127.0.0.1:9007/api/v1/chat/stream?kbuid=1461487033909519&query=DataStructure
+# @router.get("/stream")
+# async def query(kbuid: str, query: str):
+#     logging.info(f'stream: {kbuid}, {query}')
+#     # TODO: query from db/cache, if match then return, if not then goto llm
+#     search_results = myredisVector.search_docs(kbUid=kbuid, query=query)
+#     logging.info(f'搜索结果: count={ len(search_results) }')
+#     await myredisVector.query_llm(messageUid='', threadTopic='', kbUid=kbuid, question=query, search_results=search_results)
+#     return {
+#         'message': 'ok'
+#     }
--- a/modules/python/app/config.py
+++ b/modules/python/app/config.py
@@ -0,0 +1,76 @@
+'''
+Author: jackning 270580156@qq.com
+Date: 2023-12-26 11:20:33
+LastEditors: jackning 270580156@qq.com
+LastEditTime: 2024-08-31 19:18:09
+Description: bytedesk.com https://github.com/Bytedesk/bytedesk
+  Please be aware of the BSL license restrictions before installing Bytedesk IM – 
+ selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license. 
+ 仅支持企业内部员工自用，严禁私自用于销售、二次销售或者部署SaaS方式销售 
+ Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE 
+ contact: 270580156@qq.com 
+ 技术/商务联系：270580156@qq.com
+Copyright (c) 2024 by bytedesk.com, All Rights Reserved. 
+'''
+from functools import lru_cache
+from typing import Annotated
+from fastapi import APIRouter, Depends
+from pydantic_settings import BaseSettings
+
+# https://fastapi.tiangolo.com/zh/advanced/settings/#pydantic-settings 
+class Settings(BaseSettings):
+    """Application settings declared as pydantic-settings fields.
+
+    Every field is declared without a default value. The nested ``Config``
+    points pydantic-settings at the ``.env`` file and makes variable names
+    case-insensitive.
+    """
+    DEBUG: bool
+    API_V1_PREFIX: str 
+    # Passed as the model name/path of the HuggingFace embeddings (see redisVector.py)
+    EMBEDDINGS_PATH: str
+    # API key handed to the ZhipuAI client
+    ZHIPU_API_KEY: str
+    # MySQL database connection
+    DATABASE_URL: str
+    ASYNC_DATABASE_URL: str
+    # REDIS
+    REDIS_HOST: str
+    REDIS_PORT: int
+    REDIS_PASSWORD: str
+    REDIS_URL: str
+    REDIS_KEY_PREFIX: str
+    REDIS_INDEX_NAME: str
+    # NOTE(review): declared as str although the name reads like a boolean flag - confirm
+    IS_VECTOR_STORE_INITIATED: str
+
+    # https://docs.pydantic.dev/latest/api/config/
+    # https://fastapi.tiangolo.com/zh/advanced/settings/#env_1
+    class Config:
+        case_sensitive = False
+        # File the environment variables are read from
+        env_file = ".env"
+# 
+# settings = Settings()
+
+@lru_cache(maxsize=32)
+def get_settings():
+    return Settings()
+
+# Router for the settings endpoints (prefix /settings); the routes themselves are
+# currently commented out below.
+router = APIRouter(
+    prefix='/settings',
+    tags=['settings v1 apis']
+)
+
+# # http://127.0.0.1:9007/api/v1/settings/info
+# @router.get("/info")
+# async def info(settings: Annotated[Settings, Depends(get_settings)]):
+#     print('cache info: ', get_settings.cache_info(), settings.API_V1_PREFIX)
+#     if (settings.DEBUG):
+#         return {
+#             "env": settings.model_dump()
+#         }
+#     return {
+#         "env": 'None'
+#     }
+
+# # http://127.0.0.1:9007/api/v1/settings/clear_cache
+# # 添加一个清除缓存的路由
+# @router.get("/clear_cache")
+# def clear_cache():
+#     print('cache info: ', get_settings.cache_info())
+#     get_settings.cache_clear()  # 清除get_settings函数的缓存
+#     return {"message": "Cache cleared"}
--- a/modules/python/app/consts.py
+++ b/modules/python/app/consts.py
@@ -0,0 +1,31 @@
+'''
+Author: jackning 270580156@qq.com
+Date: 2024-08-29 18:23:40
+LastEditors: jackning 270580156@qq.com
+LastEditTime: 2024-08-30 16:54:39
+Description: bytedesk.com https://github.com/Bytedesk/bytedesk
+  Please be aware of the BSL license restrictions before installing Bytedesk IM – 
+ selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license. 
+ 仅支持企业内部员工自用，严禁私自用于销售、二次销售或者部署SaaS方式销售 
+ Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE 
+ contact: 270580156@qq.com 
+ 技术/商务联系：270580156@qq.com
+Copyright (c) 2024 by bytedesk.com, All Rights Reserved. 
+'''
+#
+# Redis pub/sub channel name used for both publishing and subscribing.
+pubsubChannel: str = 'bytedeskim:pubsub'
+# FIXME: the encoding/decoding must be unified with the Java server: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xac in position 0: invalid start byte
+# pubsubObjectChannel: str = 'bytedeskim:pubsub_object'
+# Message types for file parsing (request / success reply / error reply)
+PARSE_FILE: str = 'PARSE_FILE'
+PARSE_FILE_SUCCESS: str = 'PARSE_FILE_SUCCESS'
+PARSE_FILE_ERROR: str = 'PARSE_FILE_ERROR'
+# Message types for file deletion (request / success reply / error reply)
+DELETE_FILE: str = 'DELETE_FILE'
+DELETE_FILE_SUCCESS: str = 'DELETE_FILE_SUCCESS'
+DELETE_FILE_ERROR: str = 'DELETE_FILE_ERROR'
+# Message types for question answering (question / streamed answer chunk / final chunk)
+QUESTION: str = 'QUESTION'
+ANSWER: str = 'ANSWER'
+ANSWER_FINISHED: str = 'ANSWER_FINISHED'
+
--- a/modules/python/app/doc.py
+++ b/modules/python/app/doc.py
@@ -0,0 +1,111 @@
+'''
+Author: jackning 270580156@qq.com
+Date: 2024-08-29 09:55:30
+LastEditors: jackning 270580156@qq.com
+LastEditTime: 2024-08-31 07:07:31
+Description: bytedesk.com https://github.com/Bytedesk/bytedesk
+  Please be aware of the BSL license restrictions before installing Bytedesk IM – 
+ selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license. 
+ 仅支持企业内部员工自用，严禁私自用于销售、二次销售或者部署SaaS方式销售 
+ Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE 
+ contact: 270580156@qq.com 
+ 技术/商务联系：270580156@qq.com
+Copyright (c) 2024 by bytedesk.com, All Rights Reserved. 
+'''
+# 
+import logging
+import uuid
+from typing import List
+from fastapi import APIRouter
+from app.redisVector import myredisVector
+from langchain_community.document_loaders import (
+    PyPDFLoader,
+    TextLoader,
+    UnstructuredWordDocumentLoader,
+    CSVLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredImageLoader,
+    UnstructuredExcelLoader,
+    UnstructuredXMLLoader,
+    UnstructuredRTFLoader
+)
+# from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
+from langchain_unstructured import UnstructuredLoader
+# from langchain_text_splitters import RecursiveCharacterTextSplitter
+from app.textsplitter.chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
+from langchain.docstore.document import Document
+
+# https://python.langchain.com/v0.2/docs/integrations/document_loaders/pypdfloader/
+# https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/
+def load_files(filePath: str):
+    if (filePath.endswith(".pdf")):
+        loader = PyPDFLoader(filePath)
+    elif (filePath.endswith(".txt")):
+        loader = TextLoader(filePath)
+    elif (filePath.endswith(".doc") or filePath.endswith(".docx")):
+        loader = UnstructuredWordDocumentLoader(filePath)
+    elif (filePath.endswith(".md")):
+        loader = UnstructuredMarkdownLoader(filePath)
+    elif (filePath.endswith(".html")):
+        loader = UnstructuredHTMLLoader(filePath)
+    elif (filePath.endswith(".png") or filePath.endswith(".jpg")) or filePath.endswith(".jpeg"):
+        loader = UnstructuredImageLoader(filePath)
+    elif (filePath.endswith(".xlsx")):
+        loader = UnstructuredExcelLoader(filePath)
+    elif (filePath.endswith(".csv")):
+        loader = CSVLoader(filePath)
+    elif (filePath.endswith(".xml")):
+        loader = UnstructuredXMLLoader(filePath)
+    elif (filePath.endswith(".rtf")):
+        loader = UnstructuredRTFLoader(filePath)
+    elif (filePath.endswith(".epub")):
+        loader = UnstructuredEPubLoader(filePath)
+    else:
+        loader = UnstructuredLoader(filePath)
+    docs = loader.load()
+    # print(docs[0].metadata)
+    return docs
+
+def split_docs(docs: List[Document]) -> List[Document]:
+    # Load example document
+    text_splitter = ChineseRecursiveTextSplitter(
+        chunk_size=500,
+        chunk_overlap=50,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    texts = text_splitter.split_documents(docs)
+    # print(texts[0])
+    return texts
+
+# 定义一个回调函数来处理接收到的消息
+
+
+def load_and_parse(fileUid: str, filePath: str, kbUid: str) -> List[str]:
+    logging.info(f"load_and_parse: {filePath}")
+    # 解析
+    docs = load_files(filePath)
+    logging.info(f"Loaded {len(docs)} documents")
+    # 分块
+    splited_texts = split_docs(docs)
+    for doc in splited_texts:
+        doc.metadata["uid"] = str(uuid.uuid4().hex)
+        doc.metadata["file_uid"] = fileUid
+        doc.metadata["kb_uid"] = kbUid
+    logging.info(f"Split into {len(splited_texts)} chunks")
+    # 存储到redis
+    docIds = myredisVector.add_docs(splited_texts)
+    logging.info(f"Stored in redis")
+    return docIds
+
+
+# Router for the document endpoints; routes registered on it get the /docs prefix.
+router = APIRouter(
+    prefix='/docs',
+    tags=['docs v1 apis']
+)
+
+@router.get("/test")
+def test():
+    return {"docs": "test"}
--- a/modules/python/app/redis.py
+++ b/modules/python/app/redis.py
@@ -0,0 +1,110 @@
+'''
+Author: jackning 270580156@qq.com
+Date: 2024-08-29 09:55:35
+LastEditors: jackning 270580156@qq.com
+LastEditTime: 2024-08-31 11:40:51
+Description: bytedesk.com https://github.com/Bytedesk/bytedesk
+  Please be aware of the BSL license restrictions before installing Bytedesk IM – 
+ selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license. 
+ 仅支持企业内部员工自用，严禁私自用于销售、二次销售或者部署SaaS方式销售 
+ Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE 
+ contact: 270580156@qq.com 
+ 技术/商务联系：270580156@qq.com
+Copyright (c) 2024 by bytedesk.com, All Rights Reserved. 
+'''
+#
+# import time
+import logging
+from typing import List
+import redis
+import json
+import uuid
+from app.redisPubsub import defaultPublish, publishDeleteFileError, publishDeleteFileSuccess, publishParseFileSuccess
+from app.redisVector import myredisVector
+from app.config import get_settings
+from app.doc import load_and_parse
+from app.utils import download_file
+from app.consts import DELETE_FILE, PARSE_FILE, QUESTION, pubsubChannel
+
+# Module-level Redis client built from the application settings.
+# decode_responses=True makes the client return str instead of bytes.
+redisClient = redis.Redis(host=get_settings().REDIS_HOST,
+                          password=get_settings().REDIS_PASSWORD,
+                          port=get_settings().REDIS_PORT,
+                          decode_responses=True)
+
+
+def defaultSubscribe() -> None:
+    subscribe(pubsubChannel)
+    # subscribe(pubsubObjectChannel)
+
+def subscribe(channel):
+    logging.info(f'subscribe channel: {channel}')
+    pubsub = redisClient.pubsub()
+    pubsub.subscribe(channel)
+    pubsub.subscribe(**{channel: on_message})
+    pubsub.run_in_thread(0.1)
+# 
+def on_message(message):
+    # {"content":"http://127.0.0.1:9003/file/240828150817_北京软考通知.pdf","type":"UPLOAD_FILE"}
+    # logging.info(f"on_message Received message: {message['data']}")
+    # 解析消息字符串为字典
+    data_dict = json.loads(message['data'])
+    # 获取content和type的内容
+    type_ = data_dict.get('type')
+    content = data_dict.get('content')
+    content_dict = json.loads(content)
+    if type_ == PARSE_FILE:
+        fileUid = content_dict.get('fileUid')
+        fileUrl = content_dict.get('fileUrl')
+        kbUid = content_dict.get('kbUid')
+        # 打印获取到的content和type
+        logging.info(f"Received type: {type_}, {fileUid}, {fileUrl}, {kbUid}")
+        parse_file(fileUid, fileUrl, kbUid)
+    elif type_ == DELETE_FILE:
+        fileUid = content_dict.get('fileUid')
+        docIds = content_dict.get('docIds')
+        logging.info(f"Received type: {type_}, {fileUid}, {docIds}")
+        delete_docs(fileUid=fileUid, docIds=docIds)
+    elif type_ == QUESTION:
+        uid = content_dict.get('uid')
+        threadTopic = content_dict.get('threadTopic')
+        kbUid = content_dict.get('kbUid')
+        question = content_dict.get('question')
+        # 打印获取到的content和type
+        logging.info(f"Received type: {type_}, {threadTopic}, {kbUid}, {question}")
+        search_results = myredisVector.search_docs(kbUid=kbUid, query=question)
+        myredisVector.query_llm(messageUid=uid, threadTopic=threadTopic, kbUid=kbUid, question=question, search_results=search_results)
+    # else:
+        # logging.info(f"Received unknown type: {type_}")
+
+
+def parse_file(fileUid: str, fileUrl: str, kbUid: str):
+    # 下载文件到file文件夹
+    # 如果fileUrl存在，则下载文件
+    if fileUrl:
+        filePath = download_file(fileUrl)
+        logging.info(f"Saved file: {filePath}")
+    # 解析
+    docIds = load_and_parse(fileUid=fileUid, filePath=filePath, kbUid=kbUid)
+    #
+    publishParseFileSuccess(fileUid=fileUid, docIds=docIds)
+    return
+
+def delete_docs(fileUid: str, docIds: List[str]):
+    # logging.info(f"delete fileUid: {fileUid}, {docIds}")
+    result = myredisVector.delete_docs(docIds=docIds)
+    #
+    if result:
+        logging.info(f"delete fileUid success: {result} {docIds}")
+        publishDeleteFileSuccess(fileUid=fileUid)
+    else:
+        logging.info(f"delete fileUid fail: {result} {docIds}")
+        publishDeleteFileError(fileUid=fileUid, errorMsg="delete fail")
+    return
+
+
+def setKey(key: str, value: str):
+    # Store `value` under `key` using the module-level Redis client.
+    redisClient.set(key, value)
+
+
+def getKey(key: str):
+    redisClient.get(key)
--- a/modules/python/app/redisPubsub.py
+++ b/modules/python/app/redisPubsub.py
@@ -0,0 +1,146 @@
+'''
+Author: jackning 270580156@qq.com
+Date: 2024-08-29 18:21:14
+LastEditors: jackning 270580156@qq.com
+LastEditTime: 2024-08-31 10:18:58
+Description: bytedesk.com https://github.com/Bytedesk/bytedesk
+  Please be aware of the BSL license restrictions before installing Bytedesk IM – 
+ selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license. 
+ 仅支持企业内部员工自用，严禁私自用于销售、二次销售或者部署SaaS方式销售 
+ Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE 
+ contact: 270580156@qq.com 
+ 技术/商务联系：270580156@qq.com
+Copyright (c) 2024 by bytedesk.com, All Rights Reserved. 
+'''
+import json
+import logging
+from typing import List
+import redis
+from app.config import get_settings
+from app.consts import DELETE_FILE_ERROR, DELETE_FILE_SUCCESS, PARSE_FILE_ERROR, PARSE_FILE_SUCCESS, ANSWER, ANSWER_FINISHED, pubsubChannel
+#
+# Module-level Redis client used for publishing, built from the application settings.
+# decode_responses=True makes the client return str instead of bytes.
+redisClient = redis.Redis(host=get_settings().REDIS_HOST,
+                          password=get_settings().REDIS_PASSWORD,
+                          port=get_settings().REDIS_PORT,
+                          decode_responses=True)
+#
+
+
+def publishParseFileSuccess(fileUid: str, docIds: List[str]) -> None:
+    content = json.dumps({
+        "fileUid": fileUid,
+        "docIds": docIds
+    })
+    message = json.dumps({
+        "type": PARSE_FILE_SUCCESS,
+        "content": content,
+    }, ensure_ascii=False)
+    defaultPublish(content=message)
+
+
+def publishParseFileError(fileUid: str, errorMsg: str) -> None:
+    content = json.dumps({
+        "fileUid": fileUid,
+        "errorMsg": errorMsg,
+    })
+    message = json.dumps({
+        "type": PARSE_FILE_ERROR,
+        "content": content,
+    }, ensure_ascii=False)
+    defaultPublish(content=message)
+
+
+def publishDeleteFileSuccess(fileUid: str) -> None:
+    content = json.dumps({
+        "fileUid": fileUid,
+    })
+    message = json.dumps({
+        "type": DELETE_FILE_SUCCESS,
+        "content": content,
+    }, ensure_ascii=False)
+    defaultPublish(content=message)
+
+
+def publishDeleteFileError(fileUid: str, errorMsg: str) -> None:
+    content = json.dumps({
+        "fileUid": fileUid,
+        "errorMsg": errorMsg,
+    })
+    message = json.dumps({
+        "type": DELETE_FILE_ERROR,
+        "content": content,
+    }, ensure_ascii=False)
+    defaultPublish(content=message)
+
+
+def publishAnswerMessage(
+    id: int,
+    uid: str,
+    threadTopic: str, 
+    kbUid: str, 
+    question: str, 
+    answer: str, 
+    model: str, 
+    created: int) -> None:
+    # 
+    content = json.dumps({
+        "id": id,
+        "uid": uid,
+        "threadTopic": threadTopic,
+        "kbUid": kbUid,
+        "question": question,
+        "answer": answer,
+        "model": model,
+        "created": created
+    })
+    message = json.dumps({
+        "type": ANSWER,
+        "content": content,
+    }, ensure_ascii=False)
+    defaultPublish(content=message)
+    return
+
+
+def publishAnswerFinished(
+    id: int,
+    uid: str,
+    threadTopic: str, 
+    kbUid: str, 
+    question: str, 
+    answer: str, 
+    model: str, 
+    created: int, 
+    promptTokens: str, 
+    completionTokens: str, 
+    totalTokens: str) -> None:
+    # 
+    content = json.dumps({
+        "id": id,
+        "uid": uid,
+        "threadTopic": threadTopic,
+        "kbUid": kbUid,
+        "question": question,
+        "answer": answer,
+        "model": model,
+        "created": created,
+        "promptTokens": promptTokens,
+        "completionTokens": completionTokens,
+        "totalTokens": totalTokens
+    })
+    message = json.dumps({
+        "type": ANSWER_FINISHED,
+        "content": content,
+    }, ensure_ascii=False)
+    defaultPublish(content=message)
+    return
+
+#
+
+
+def defaultPublish(content: str) -> None:
+    publish(pubsubChannel, content)
+
+
+def publish(channel, message):
+    # logging.info(f'publish {message} to channel: {channel}')
+    redisClient.publish(channel, message)
--- a/modules/python/app/redisVector.py
+++ b/modules/python/app/redisVector.py
@@ -0,0 +1,194 @@
+'''
+Author: jackning 270580156@qq.com
+Date: 2024-08-29 14:49:54
+LastEditors: jackning 270580156@qq.com
+LastEditTime: 2024-09-03 09:14:22
+Description: bytedesk.com https://github.com/Bytedesk/bytedesk
+  Please be aware of the BSL license restrictions before installing Bytedesk IM – 
+ selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license. 
+ 仅支持企业内部员工自用，严禁私自用于销售、二次销售或者部署SaaS方式销售 
+ Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE 
+ contact: 270580156@qq.com 
+ 技术/商务联系：270580156@qq.com
+Copyright (c) 2024 by bytedesk.com, All Rights Reserved. 
+'''
+#
+import logging
+from typing import List
+from langchain_redis import RedisConfig, RedisVectorStore
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores.redis.filters import RedisFilter
+from langchain.docstore.document import Document
+from redisvl.query.filter import Tag
+from zhipuai import ZhipuAI
+
+from app.config import get_settings
+from app.redisPubsub import publishAnswerMessage, publishAnswerFinished
+
+# ZhipuAI client; the API key comes from the application settings.
+zhipuAi = ZhipuAI(api_key=get_settings().ZHIPU_API_KEY)  # fill in your own API key
+
+# Index schema handed to RedisConfig: kb_uid and file_uid are declared as tag fields.
+redis_index_schema = {
+    "tag": [{"name": "kb_uid"}, {"name": "file_uid"}],
+}
+
+# https://python.langchain.com/v0.2/docs/integrations/vectorstores/redis/
+
+class MyRedisVector:
+    """Wraps a LangChain RedisVectorStore and the ZhipuAI chat call used to answer questions."""
+    #
+    embeddings: HuggingFaceEmbeddings
+    vector_store: RedisVectorStore
+    #
+    def __init__(self):
+        """Build the embedding model and the Redis vector store from the application settings."""
+        self.embeddings = HuggingFaceEmbeddings(
+            model_name=get_settings().EMBEDDINGS_PATH)
+        #
+        config = RedisConfig(
+            index_name=get_settings().REDIS_INDEX_NAME,
+            key_prefix=get_settings().REDIS_KEY_PREFIX,
+            redis_url=get_settings().REDIS_URL,
+            redis_index_schema=redis_index_schema,
+            metadata_schema=[
+                {
+                    "name": "kb_uid",
+                    "type": "tag",
+                },
+                {
+                    "name": "file_uid",
+                    "type": "tag",
+                }
+            ]
+        )
+        self.vector_store = RedisVectorStore(self.embeddings, config=config)
+        return
+
+    def add_docs(self, docs: List[Document]) -> List[str]:
+        """Add the documents to the vector store and return what add_documents returns (their ids)."""
+        # Store the documents in vector_store
+        results = self.vector_store.add_documents(docs)
+        logging.info(f"add_texts result: {results}")
+        return results
+    
+    def delete_docs(self, docIds: List[str]) -> int:
+        """Drop the given keys from the store's index; returns 0 without touching the store when docIds is empty."""
+        # Remove the docs from vector_store
+        # If docIds is empty, return 0
+        if not docIds:
+            return 0
+        # Return the number of deleted documents
+        return self.vector_store.index.drop_keys(docIds)
+
+    # FIXME: RuntimeError: Index has not been created. Must be created before calling search
+    def search_docs(self, kbUid: str, query: str) -> List[Document]:
+        """Similarity-search the store for `query`, restricted to kb_uid == kbUid, requesting k=3 results."""
+        kb_filter = Tag("kb_uid") == kbUid
+        results = self.vector_store.similarity_search(
+            query, k=3, filter=kb_filter)
+        # logging.info(f'search_store results {results}, {len(results)}')
+        return results
+
+    # 4.2 Search the vector store for the question through a retriever
+    # https://python.langchain.com/docs/integrations/vectorstores/redis#redis-as-retriever
+    def search_as_retriever(self, kbUid: str, query: str) -> List[Document]:
+        """Same search as search_docs, but going through vector_store.as_retriever (similarity, k=3)."""
+        # Should the query be embedded first? The langchain demo does not embed it
+        # kb_filter = RedisFilter.tag("kb_uid") == kbUid
+        kb_filter = Tag("kb_uid") == kbUid
+        # There are three search algorithms
+        # 1 the default algorithm
+        retriever = self.vector_store.as_retriever(
+            search_type="similarity",
+            search_kwargs={
+                "k": 3,
+                "filter": kb_filter
+            }
+        )
+        # 2 similarity_distance_threshold retriever which allows the user to specify the vector distance
+        # retriever = self.vector_store.as_retriever(
+        #     search_type="similarity_distance_threshold",
+        #     search_kwargs={"k": 3, "distance_threshold": 0.1,
+        #                    "filter": kb_filter},
+        # )
+        # 3 the similarity_score_threshold allows the user to define the minimum score for similar documents
+        # retriever = self.vector_store.as_retriever(
+        #     search_type="similarity_score_threshold",
+        #     search_kwargs={"score_threshold": 0.9,
+        #                    "k": 3, "filter": kb_filter}, 
+        # )
+        #
+        results = retriever.get_relevant_documents(query)
+        print(f'search_as_retriever results {results}, {len(results)}')
+        return results
+
+    # 5. Generation: pass the question and the search results to the LLM and return the answer
+    def query_llm(self, messageUid: str, threadTopic: str, kbUid: str, question: str, search_results: List[Document]):
+        """Ask the LLM the question with the search results as context and publish the streamed answer.
+
+        The page contents of search_results are joined with newlines into the
+        system prompt; the question is sent as the user message. Every streamed
+        chunk is published: the chunk whose finish_reason is 'stop' goes out via
+        publishAnswerFinished together with the token usage, every other chunk
+        via publishAnswerMessage. The published id is a counter starting at 1.
+        """
+        # logging.info(f'query_llm search_results {search_results}, {len(search_results)}')
+        # Join the search results
+        # context = "\n".join(search_results)
+        context = "\n".join([doc.page_content for doc in search_results])
+        # logging.info(f'query_llm context {context}')
+        # Prompt template for question answering over the local knowledge base
+        # TODO: write an english version 
+        # <问题 > {query} < /问题 >
+        prompt_template = f'''
+        ### Human: <指令>根据已知信息，简洁和专业的来回答问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题”，不允许在答案中添加编造成分，答案请使用跟问题中同样的语言。 </指令>
+        <已知信息>{ context }</已知信息>###
+        Assistant:'''
+        # NOTE(review): `{ { prompt_template } }` evaluates a set literal, so this prints the repr of a one-element set - confirm intent
+        print(f'prompt_template: { { prompt_template } }')
+
+        # https://open.bigmodel.cn/dev/api#sdk_example
+        response = zhipuAi.chat.completions.create(
+            model="glm-4-flash", # free tier
+            # prompt=[{"role": "user", "content": prompt_template}],
+            messages=[
+                {"role": "system", "content": prompt_template},
+                {"role": "user", "content": question},
+            ],
+            top_p=0.7,
+            temperature=0.3,
+            stream=True
+        )
+        #
+        counter = 0
+        for chunk in response:
+            # ChatCompletionChunk(id='202408291708554ae5d8eac6e94630', choices=[Choice(delta=ChoiceDelta(content='凡', role='assistant', tool_calls=None), finish_reason=None, index=0)], created=1724922535, model='glm-4-air', usage=None, extra_json=None)
+            # ChatCompletionChunk(id='202408291708554ae5d8eac6e94630', choices=[Choice(delta=ChoiceDelta(content='', role='assistant', tool_calls=None), finish_reason='stop', index=0)], created=1724922535, model='glm-4-air', usage=CompletionUsage(prompt_tokens=453, completion_tokens=34, total_tokens=487), extra_json=None)
+            # print(f'chunk: { chunk }')
+            # INFO: query_llm chunk: , stop, glm-4-air, CompletionUsage(prompt_tokens=453, completion_tokens=34, total_tokens=487)
+            #
+            counter += 1
+            id_to_publish = counter
+            # 
+            answer = chunk.choices[0].delta.content
+            model = chunk.model
+            created = chunk.created
+            finish_reason = chunk.choices[0].finish_reason
+            logging.info(
+                f'query_llm: {counter} {chunk.choices[0].delta.content}, {chunk.choices[0].finish_reason}, {chunk.model}, {chunk.created}, {chunk.usage}')
+            # NOTE(review): only 'stop' is treated as terminal; a stream ending with any other finish_reason never publishes ANSWER_FINISHED - confirm
+            if finish_reason == 'stop':
+                promptTokens = chunk.usage.prompt_tokens
+                completionTokens = chunk.usage.completion_tokens
+                totalTokens = chunk.usage.total_tokens
+                publishAnswerFinished(
+                    id=id_to_publish,
+                    uid=messageUid,
+                    threadTopic=threadTopic, 
+                    kbUid=kbUid, 
+                    question=question, 
+                    answer=answer,
+                    model=model, 
+                    created=created,
+                    promptTokens=promptTokens, 
+                    completionTokens=completionTokens, 
+                    totalTokens=totalTokens
+                )
+            else:
+                publishAnswerMessage(
+                    id=id_to_publish,
+                    uid=messageUid,
+                    threadTopic=threadTopic, 
+                    kbUid=kbUid, 
+                    question=question, 
+                    answer=answer,
+                    model=model, 
+                    created=created
+                )
+
+
+# Module-level shared instance; constructed at import time, which builds the
+# embeddings and the vector store in MyRedisVector.__init__.
+myredisVector = MyRedisVector()
+#
--- a/modules/python/app/textsplitter/chinese_recursive_text_splitter.py
+++ b/modules/python/app/textsplitter/chinese_recursive_text_splitter.py
@@ -0,0 +1,107 @@
+import logging
+import re
+from typing import Any, List, Optional
+
+# from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+# from chatchat.utils import build_logger
+# logger = build_logger()
+
+def _split_text_with_regex_from_end(
+    text: str, separator: str, keep_separator: bool
+) -> List[str]:
+    # Now that we have the separator, split the text
+    if separator:
+        if keep_separator:
+            # The parentheses in the pattern keep the delimiters in the result.
+            _splits = re.split(f"({separator})", text)
+            splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])]
+            if len(_splits) % 2 == 1:
+                splits += _splits[-1:]
+            # splits = [_splits[0]] + splits
+        else:
+            splits = re.split(separator, text)
+    else:
+        splits = list(text)
+    return [s for s in splits if s != ""]
+
+
+class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
+    def __init__(
+        self,
+        separators: Optional[List[str]] = None,
+        keep_separator: bool = True,
+        is_separator_regex: bool = True,
+        **kwargs: Any,
+    ) -> None:
+        """Create a new TextSplitter."""
+        super().__init__(keep_separator=keep_separator, **kwargs)
+        self._separators = separators or [
+            "\n\n",
+            "\n",
+            "。|！|？",
+            "\.\s|\!\s|\?\s",
+            "；|;\s",
+            "，|,\s",
+        ]
+        self._is_separator_regex = is_separator_regex
+
+    def _split_text(self, text: str, separators: List[str]) -> List[str]:
+        """Split incoming text and return chunks."""
+        final_chunks = []
+        # Get appropriate separator to use
+        separator = separators[-1]
+        new_separators = []
+        for i, _s in enumerate(separators):
+            _separator = _s if self._is_separator_regex else re.escape(_s)
+            if _s == "":
+                separator = _s
+                break
+            if re.search(_separator, text):
+                separator = _s
+                new_separators = separators[i + 1 :]
+                break
+
+        _separator = separator if self._is_separator_regex else re.escape(separator)
+        splits = _split_text_with_regex_from_end(text, _separator, self._keep_separator)
+
+        # Now go merging things, recursively splitting longer texts.
+        _good_splits = []
+        _separator = "" if self._keep_separator else separator
+        for s in splits:
+            if self._length_function(s) < self._chunk_size:
+                _good_splits.append(s)
+            else:
+                if _good_splits:
+                    merged_text = self._merge_splits(_good_splits, _separator)
+                    final_chunks.extend(merged_text)
+                    _good_splits = []
+                if not new_separators:
+                    final_chunks.append(s)
+                else:
+                    other_info = self._split_text(s, new_separators)
+                    final_chunks.extend(other_info)
+        if _good_splits:
+            merged_text = self._merge_splits(_good_splits, _separator)
+            final_chunks.extend(merged_text)
+        return [
+            re.sub(r"\n{2,}", "\n", chunk.strip())
+            for chunk in final_chunks
+            if chunk.strip() != ""
+        ]
+
+
+# Manual demo: split one sample Chinese report into chunks of at most ~50
+# characters (no overlap) and print the sample index followed by each chunk.
+if __name__ == "__main__":
+    text_splitter = ChineseRecursiveTextSplitter(
+        keep_separator=True, is_separator_regex=True, chunk_size=50, chunk_overlap=0
+    )
+    ls = [
+        """中国对外贸易形势报告（75页）。前 10 个月，一般贸易进出口 19.5 万亿元，增长 25.1%， 比整体进出口增速高出 2.9 个百分点，占进出口总额的 61.7%，较去年同期提升 1.6 个百分点。其中，一般贸易出口 10.6 万亿元，增长 25.3%，占出口总额的 60.9%，提升 1.5 个百分点；进口8.9万亿元，增长24.9%，占进口总额的62.7%， 提升 1.8 个百分点。加工贸易进出口 6.8 万亿元，增长 11.8%， 占进出口总额的 21.5%，减少 2.0 个百分点。其中，出口增 长 10.4%，占出口总额的 24.3%，减少 2.6 个百分点；进口增 长 14.2%，占进口总额的 18.0%，减少 1.2 个百分点。此外， 以保税物流方式进出口 3.96 万亿元，增长 27.9%。其中，出 口 1.47 万亿元，增长 38.9%；进口 2.49 万亿元，增长 22.2%。前三季度，中国服务贸易继续保持快速增长态势。服务 进出口总额 37834.3 亿元，增长 11.6%；其中服务出口 17820.9 亿元，增长 27.3%；进口 20013.4 亿元，增长 0.5%，进口增 速实现了疫情以来的首次转正。服务出口增幅大于进口 26.8 个百分点，带动服务贸易逆差下降 62.9%至 2192.5 亿元。服 务贸易结构持续优化，知识密集型服务进出口 16917.7 亿元， 增长 13.3%，占服务进出口总额的比重达到 44.7%，提升 0.7 个百分点。 二、中国对外贸易发展环境分析和展望 全球疫情起伏反复，经济复苏分化加剧，大宗商品价格 上涨、能源紧缺、运力紧张及发达经济体政策调整外溢等风 险交织叠加。同时也要看到，我国经济长期向好的趋势没有 改变，外贸企业韧性和活力不断增强，新业态新模式加快发 展，创新转型步伐提速。产业链供应链面临挑战。美欧等加快出台制造业回迁计 划，加速产业链供应链本土布局，跨国公司调整产业链供应 链，全球双链面临新一轮重构，区域化、近岸化、本土化、 短链化趋势凸显。疫苗供应不足，制造业“缺芯”、物流受限、 运价高企，全球产业链供应链面临压力。 全球通胀持续高位运行。能源价格上涨加大主要经济体 的通胀压力，增加全球经济复苏的不确定性。世界银行今年 10 月发布《大宗商品市场展望》指出，能源价格在 2021 年 大涨逾 80%，并且仍将在 2022 年小幅上涨。IMF 指出，全 球通胀上行风险加剧，通胀前景存在巨大不确定性。""",
+    ]
+    # text = """"""
+    for inum, text in enumerate(ls):
+        print(inum)
+        chunks = text_splitter.split_text(text)
+        for chunk in chunks:
+            print(chunk)
--- a/modules/python/app/tts.py
+++ b/modules/python/app/tts.py
@@ -0,0 +1,26 @@
+#
+#
+import logging
+from fastapi import APIRouter, Request
+# from sse_starlette import EventSourceResponse
+import edge_tts
+
+# https://github.com/rany2/edge-tts/blob/master/README.md
+# https://tts.byylook.com/ai/text-to-speech?source=github
+# Router for the text-to-speech endpoints; routes registered on it get the /tts prefix.
+router = APIRouter(
+    prefix='/tts',
+    tags=['tts v1 apis']
+)
+
+#
+# https://github.com/rany2/edge-tts/blob/master/examples/basic_generation.py
+# # http://127.0.0.1:9007/api/v1/tts/test
+# 列出音色：edge-tts --list-voices
+@router.get("/test")
+async def tts():
+    TEXT = "Hello World!"
+    VOICE = "en-GB-SoniaNeural"
+    OUTPUT_FILE = "test.mp3"
+    communicate = edge_tts.Communicate(TEXT, VOICE)
+    await communicate.save(OUTPUT_FILE)
+    return "ok"
--- a/modules/python/app/utils.py
+++ b/modules/python/app/utils.py
@@ -0,0 +1,49 @@
+'''
+Author: jackning 270580156@qq.com
+Date: 2024-08-29 09:55:35
+LastEditors: jackning 270580156@qq.com
+LastEditTime: 2024-08-29 15:10:19
+Description: bytedesk.com https://github.com/Bytedesk/bytedesk
+  Please be aware of the BSL license restrictions before installing Bytedesk IM – 
+ selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license. 
+ 仅支持企业内部员工自用，严禁私自用于销售、二次销售或者部署SaaS方式销售 
+ Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE 
+ contact: 270580156@qq.com 
+ 技术/商务联系：270580156@qq.com
+Copyright (c) 2024 by bytedesk.com, All Rights Reserved. 
+'''
+# 
+import os
+import requests
+
+def download_file(file_url: str, destination_folder: str = 'files') -> str | None:
+    """
+    下载文件到指定文件夹。
+    
+    :param file_url: 要下载的文件的URL。
+    :param destination_folder: 保存文件的文件夹名称，默认为'file'。
+    """
+    # 确保目标文件夹存在，如果不存在则创建
+    if not os.path.exists(destination_folder):
+        os.makedirs(destination_folder)
+
+    # 从URL中获取文件名
+    file_name = file_url.split('/')[-1]
+
+    # 构建完整的文件保存路径
+    file_path = os.path.join(destination_folder, file_name)
+
+    # 使用requests库下载文件
+    with requests.get(file_url, stream=True) as response:
+        if response.status_code == 200:
+            # 以二进制写入模式打开文件
+            with open(file_path, 'wb') as file:
+                # 分块写入文件内容
+                for chunk in response.iter_content(chunk_size=8192):
+                    file.write(chunk)
+            print(f"文件已成功下载到: {file_path}")
+        else:
+            print(f"下载失败，状态码: {response.status_code}")
+
+    # 返回文件保存路径
+    return file_path