mirror of
https://gitee.com/270580156/weiyu.git
synced 2026-05-14 19:27:53 +00:00
Sync from bytedesk-private: update
This commit is contained in:
0
modules/python/app/__init__.py
Normal file
0
modules/python/app/__init__.py
Normal file
24
modules/python/app/api.py
Normal file
24
modules/python/app/api.py
Normal file
@@ -0,0 +1,24 @@
|
||||
'''
|
||||
Author: jackning 270580156@qq.com
|
||||
Date: 2024-08-29 10:35:52
|
||||
LastEditors: jackning 270580156@qq.com
|
||||
LastEditTime: 2024-08-29 10:38:28
|
||||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||||
contact: 270580156@qq.com
|
||||
技术/商务联系:270580156@qq.com
|
||||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||||
'''
|
||||
#
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app import chat, config, doc, tts
|
||||
#
|
||||
# Single v1 router that aggregates the routers exposed by each feature module.
api_v1_router = APIRouter()
for _feature in (chat, config, doc, tts):
    api_v1_router.include_router(_feature.router)
|
||||
47
modules/python/app/chat.py
Normal file
47
modules/python/app/chat.py
Normal file
@@ -0,0 +1,47 @@
|
||||
'''
|
||||
Author: jackning 270580156@qq.com
|
||||
Date: 2024-08-29 09:55:35
|
||||
LastEditors: jackning 270580156@qq.com
|
||||
LastEditTime: 2024-08-31 19:19:57
|
||||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||||
contact: 270580156@qq.com
|
||||
技术/商务联系:270580156@qq.com
|
||||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||||
'''
|
||||
#
|
||||
import logging
|
||||
from fastapi import APIRouter, Request
|
||||
from app.redisVector import myredisVector
|
||||
|
||||
# Router for the chat endpoints; mounted by the v1 API router under /chat.
router = APIRouter(prefix='/chat', tags=['chat v1 apis'])
|
||||
|
||||
# http://127.0.0.1:9007/api/v1/chat/query?kbuid=1461090177253570&query=报名条件
|
||||
# http://127.0.0.1:9007/api/v1/chat/query?kbuid=1461487033909519&query=DataStructure
|
||||
@router.get("/query")
def query(kbuid: str, query: str):
    """Return the vector-search hits for ``query`` inside knowledge base ``kbuid``.

    Debug endpoint for inspecting search results; the hits are returned
    under the ``results`` key.
    """
    # Alternative kept for reference: myredisVector.search_as_retriever(kbUid=kbuid, query=query)
    hits = myredisVector.search_docs(kbUid=kbuid, query=query)
    return {"results": hits}
|
||||
|
||||
# # http://127.0.0.1:9007/api/v1/chat/stream?kbuid=1461090177253570&query=报名条件
|
||||
# # http://127.0.0.1:9007/api/v1/chat/stream?kbuid=1461487033909519&query=DataStructure
|
||||
# @router.get("/stream")
|
||||
# async def query(kbuid: str, query: str):
|
||||
# logging.info(f'stream: {kbuid}, {query}')
|
||||
# # TODO: query from db/cache, if match then return, if not then goto llm
|
||||
# search_results = myredisVector.search_docs(kbUid=kbuid, query=query)
|
||||
# logging.info(f'搜索结果: count={ len(search_results) }')
|
||||
# await myredisVector.query_llm(messageUid='', threadTopic='', kbUid=kbuid, question=query, search_results=search_results)
|
||||
# return {
|
||||
# 'message': 'ok'
|
||||
# }
|
||||
76
modules/python/app/config.py
Normal file
76
modules/python/app/config.py
Normal file
@@ -0,0 +1,76 @@
|
||||
'''
|
||||
Author: jackning 270580156@qq.com
|
||||
Date: 2023-12-26 11:20:33
|
||||
LastEditors: jackning 270580156@qq.com
|
||||
LastEditTime: 2024-08-31 19:18:09
|
||||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||||
contact: 270580156@qq.com
|
||||
技术/商务联系:270580156@qq.com
|
||||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||||
'''
|
||||
from functools import lru_cache
|
||||
from typing import Annotated
|
||||
from fastapi import APIRouter, Depends
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
# https://fastapi.tiangolo.com/zh/advanced/settings/#pydantic-settings
|
||||
class Settings(BaseSettings):
    """Application settings populated by pydantic-settings.

    Values are read from the environment and from the ``.env`` file
    configured in the nested ``Config`` class.
    See https://fastapi.tiangolo.com/zh/advanced/settings/#pydantic-settings
    """

    # General application switches
    DEBUG: bool
    API_V1_PREFIX: str

    # Embedding model name or path
    EMBEDDINGS_PATH: str

    # ZhipuAI credentials
    ZHIPU_API_KEY: str

    # MySQL connection strings
    DATABASE_URL: str
    ASYNC_DATABASE_URL: str

    # Redis connection and vector-index settings
    REDIS_HOST: str
    REDIS_PORT: int
    REDIS_PASSWORD: str
    REDIS_URL: str
    REDIS_KEY_PREFIX: str
    REDIS_INDEX_NAME: str
    IS_VECTOR_STORE_INITIATED: str

    # https://docs.pydantic.dev/latest/api/config/
    # https://fastapi.tiangolo.com/zh/advanced/settings/#env_1
    class Config:
        # Environment variable names are matched case-insensitively.
        case_sensitive = False
        # File the settings are loaded from.
        env_file = ".env"
|
||||
#
|
||||
# settings = Settings()
|
||||
|
||||
@lru_cache(maxsize=32)
def get_settings() -> Settings:
    """Return the cached ``Settings`` instance, building it on first use."""
    loaded = Settings()
    return loaded
|
||||
|
||||
# Router for the settings endpoints; mounted by the v1 API router under /settings.
router = APIRouter(prefix='/settings', tags=['settings v1 apis'])
|
||||
|
||||
# # http://127.0.0.1:9007/api/v1/settings/info
|
||||
# @router.get("/info")
|
||||
# async def info(settings: Annotated[Settings, Depends(get_settings)]):
|
||||
# print('cache info: ', get_settings.cache_info(), settings.API_V1_PREFIX)
|
||||
# if (settings.DEBUG):
|
||||
# return {
|
||||
# "env": settings.model_dump()
|
||||
# }
|
||||
# return {
|
||||
# "env": 'None'
|
||||
# }
|
||||
|
||||
# # http://127.0.0.1:9007/api/v1/settings/clear_cache
|
||||
# # 添加一个清除缓存的路由
|
||||
# @router.get("/clear_cache")
|
||||
# def clear_cache():
|
||||
# print('cache info: ', get_settings.cache_info())
|
||||
# get_settings.cache_clear() # 清除get_settings函数的缓存
|
||||
# return {"message": "Cache cleared"}
|
||||
31
modules/python/app/consts.py
Normal file
31
modules/python/app/consts.py
Normal file
@@ -0,0 +1,31 @@
|
||||
'''
|
||||
Author: jackning 270580156@qq.com
|
||||
Date: 2024-08-29 18:23:40
|
||||
LastEditors: jackning 270580156@qq.com
|
||||
LastEditTime: 2024-08-30 16:54:39
|
||||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||||
contact: 270580156@qq.com
|
||||
技术/商务联系:270580156@qq.com
|
||||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||||
'''
|
||||
#
|
||||
# Redis pub/sub channel used for the JSON messages exchanged with this service.
pubsubChannel: str = 'bytedeskim:pubsub'
# FIXME: encoding/decoding must be unified with the Java server first:
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xac in position 0: invalid start byte
# pubsubObjectChannel: str = 'bytedeskim:pubsub_object'

# Message types: file parsing
PARSE_FILE: str = 'PARSE_FILE'
PARSE_FILE_SUCCESS: str = 'PARSE_FILE_SUCCESS'
PARSE_FILE_ERROR: str = 'PARSE_FILE_ERROR'

# Message types: file deletion
DELETE_FILE: str = 'DELETE_FILE'
DELETE_FILE_SUCCESS: str = 'DELETE_FILE_SUCCESS'
DELETE_FILE_ERROR: str = 'DELETE_FILE_ERROR'

# Message types: question answering
QUESTION: str = 'QUESTION'
ANSWER: str = 'ANSWER'
ANSWER_FINISHED: str = 'ANSWER_FINISHED'
|
||||
|
||||
111
modules/python/app/doc.py
Normal file
111
modules/python/app/doc.py
Normal file
@@ -0,0 +1,111 @@
|
||||
'''
|
||||
Author: jackning 270580156@qq.com
|
||||
Date: 2024-08-29 09:55:30
|
||||
LastEditors: jackning 270580156@qq.com
|
||||
LastEditTime: 2024-08-31 07:07:31
|
||||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||||
contact: 270580156@qq.com
|
||||
技术/商务联系:270580156@qq.com
|
||||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||||
'''
|
||||
#
|
||||
import logging
|
||||
import uuid
|
||||
from typing import List
|
||||
from fastapi import APIRouter
|
||||
from app.redisVector import myredisVector
|
||||
from langchain_community.document_loaders import (
|
||||
PyPDFLoader,
|
||||
TextLoader,
|
||||
UnstructuredWordDocumentLoader,
|
||||
CSVLoader,
|
||||
UnstructuredMarkdownLoader,
|
||||
UnstructuredEPubLoader,
|
||||
UnstructuredHTMLLoader,
|
||||
UnstructuredImageLoader,
|
||||
UnstructuredExcelLoader,
|
||||
UnstructuredXMLLoader,
|
||||
UnstructuredRTFLoader
|
||||
)
|
||||
# from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
# from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from app.textsplitter.chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
|
||||
from langchain.docstore.document import Document
|
||||
|
||||
# https://python.langchain.com/v0.2/docs/integrations/document_loaders/pypdfloader/
|
||||
# https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/
|
||||
def load_files(filePath: str) -> List[Document]:
    """Load ``filePath`` with the LangChain loader that matches its extension.

    The extension is compared case-insensitively, so ``REPORT.PDF`` is
    handled like ``report.pdf``. Files with an extension not listed below
    are handed to the generic ``UnstructuredLoader``.

    Args:
        filePath: path of the file to load.

    Returns:
        The documents produced by the selected loader's ``load()``.
    """
    # Ordered (suffixes, loader class) pairs; the first match wins.
    loaders_by_suffix = (
        ((".pdf",), PyPDFLoader),
        ((".txt",), TextLoader),
        ((".doc", ".docx"), UnstructuredWordDocumentLoader),
        ((".md",), UnstructuredMarkdownLoader),
        ((".html",), UnstructuredHTMLLoader),
        ((".png", ".jpg", ".jpeg"), UnstructuredImageLoader),
        ((".xlsx",), UnstructuredExcelLoader),
        ((".csv",), CSVLoader),
        ((".xml",), UnstructuredXMLLoader),
        ((".rtf",), UnstructuredRTFLoader),
        ((".epub",), UnstructuredEPubLoader),
    )
    # Only the comparison is lower-cased; the loader receives the real path.
    lowered = filePath.lower()
    loader_cls = next(
        (cls for suffixes, cls in loaders_by_suffix if lowered.endswith(suffixes)),
        UnstructuredLoader,
    )
    return loader_cls(filePath).load()
|
||||
|
||||
def split_docs(docs: List[Document]) -> List[Document]:
    """Split ``docs`` into chunks of at most about 500 characters with 50 overlap.

    Args:
        docs: documents to split.

    Returns:
        The chunked documents produced by ``ChineseRecursiveTextSplitter``.
    """
    text_splitter = ChineseRecursiveTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
        # The splitter's default separators are regular expressions
        # (e.g. "。|!|?"). Passing False would re.escape them, so they
        # would only match their own literal text and sentence-level
        # splitting would never happen.
        is_separator_regex=True,
    )
    return text_splitter.split_documents(docs)
|
||||
|
||||
# 定义一个回调函数来处理接收到的消息
|
||||
|
||||
|
||||
def load_and_parse(fileUid: str, filePath: str, kbUid: str) -> List[str]:
    """Load a file, chunk it, tag the chunks and store them in the vector store.

    Args:
        fileUid: identifier written to each chunk's ``file_uid`` metadata.
        filePath: path of the file to load.
        kbUid: identifier written to each chunk's ``kb_uid`` metadata.

    Returns:
        The ids returned by ``myredisVector.add_docs`` for the stored chunks.
    """
    logging.info(f"load_and_parse: {filePath}")

    # Parse the file into documents.
    loaded = load_files(filePath)
    logging.info(f"Loaded {len(loaded)} documents")

    # Chunk, then stamp every chunk with a fresh uid and its owners.
    chunks = split_docs(loaded)
    for chunk in chunks:
        chunk.metadata.update({
            "uid": uuid.uuid4().hex,
            "file_uid": fileUid,
            "kb_uid": kbUid,
        })
    logging.info(f"Split into {len(chunks)} chunks")

    # Persist to Redis.
    stored_ids = myredisVector.add_docs(chunks)
    logging.info(f"Stored in redis")
    return stored_ids
|
||||
|
||||
|
||||
# Router for the document endpoints; mounted by the v1 API router under /docs.
router = APIRouter(prefix='/docs', tags=['docs v1 apis'])
|
||||
|
||||
@router.get("/test")
def test():
    """Smoke-test endpoint that returns a fixed payload."""
    payload = {"docs": "test"}
    return payload
|
||||
110
modules/python/app/redis.py
Normal file
110
modules/python/app/redis.py
Normal file
@@ -0,0 +1,110 @@
|
||||
'''
|
||||
Author: jackning 270580156@qq.com
|
||||
Date: 2024-08-29 09:55:35
|
||||
LastEditors: jackning 270580156@qq.com
|
||||
LastEditTime: 2024-08-31 11:40:51
|
||||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||||
contact: 270580156@qq.com
|
||||
技术/商务联系:270580156@qq.com
|
||||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||||
'''
|
||||
#
|
||||
# import time
|
||||
import logging
|
||||
from typing import List
|
||||
import redis
|
||||
import json
|
||||
import uuid
|
||||
from app.redisPubsub import defaultPublish, publishDeleteFileError, publishDeleteFileSuccess, publishParseFileSuccess
|
||||
from app.redisVector import myredisVector
|
||||
from app.config import get_settings
|
||||
from app.doc import load_and_parse
|
||||
from app.utils import download_file
|
||||
from app.consts import DELETE_FILE, PARSE_FILE, QUESTION, pubsubChannel
|
||||
|
||||
# Module-wide Redis client; responses are decoded to str.
# get_settings() is cached, so reading it once yields the same values.
_settings = get_settings()
redisClient = redis.Redis(
    host=_settings.REDIS_HOST,
    password=_settings.REDIS_PASSWORD,
    port=_settings.REDIS_PORT,
    decode_responses=True,
)
|
||||
|
||||
|
||||
def defaultSubscribe() -> None:
    """Subscribe to the default pub/sub channel (``pubsubChannel``)."""
    # The object channel stays disabled: subscribe(pubsubObjectChannel)
    subscribe(channel=pubsubChannel)
|
||||
|
||||
def subscribe(channel):
    """Subscribe ``on_message`` to ``channel`` and poll it on a background thread.

    Args:
        channel: name of the Redis pub/sub channel.

    Returns:
        The worker thread created by ``run_in_thread``; call its ``stop()``
        to end the subscription. Earlier versions returned ``None``, so
        callers that ignore the result are unaffected.
    """
    logging.info(f'subscribe channel: {channel}')
    pubsub = redisClient.pubsub()
    # One SUBSCRIBE with the handler attached is enough; a preceding
    # handler-less subscribe to the same channel was redundant.
    pubsub.subscribe(**{channel: on_message})
    return pubsub.run_in_thread(0.1)
|
||||
#
|
||||
def on_message(message):
    """Dispatch one pub/sub message to the matching handler.

    The message ``data`` is expected to be a JSON object with a ``type``
    and a ``content`` field, where ``content`` is itself a JSON-encoded
    object. Only ``PARSE_FILE``, ``DELETE_FILE`` and ``QUESTION`` are
    handled; every other type is ignored.

    This runs as the subscription callback, so nothing is allowed to
    escape: malformed messages and handler failures are logged and
    dropped instead of raising out of the callback.

    Args:
        message: pub/sub message mapping; only ``message['data']`` is read.
    """
    # Example of a payload that is NOT valid for this handler (content is a bare URL):
    # {"content":"http://127.0.0.1:9003/file/240828150817_北京软考通知.pdf","type":"UPLOAD_FILE"}
    try:
        data_dict = json.loads(message['data'])
        type_ = data_dict.get('type')
        if type_ not in (PARSE_FILE, DELETE_FILE, QUESTION):
            # Not addressed to this service; its content is never decoded.
            return
        content_dict = json.loads(data_dict.get('content'))
    except (TypeError, ValueError, AttributeError, KeyError):
        logging.exception("on_message: ignoring malformed pubsub message")
        return

    try:
        if type_ == PARSE_FILE:
            fileUid = content_dict.get('fileUid')
            fileUrl = content_dict.get('fileUrl')
            kbUid = content_dict.get('kbUid')
            logging.info(f"Received type: {type_}, {fileUid}, {fileUrl}, {kbUid}")
            parse_file(fileUid, fileUrl, kbUid)
        elif type_ == DELETE_FILE:
            fileUid = content_dict.get('fileUid')
            docIds = content_dict.get('docIds')
            logging.info(f"Received type: {type_}, {fileUid}, {docIds}")
            delete_docs(fileUid=fileUid, docIds=docIds)
        else:  # QUESTION
            uid = content_dict.get('uid')
            threadTopic = content_dict.get('threadTopic')
            kbUid = content_dict.get('kbUid')
            question = content_dict.get('question')
            logging.info(f"Received type: {type_}, {threadTopic}, {kbUid}, {question}")
            search_results = myredisVector.search_docs(kbUid=kbUid, query=question)
            myredisVector.query_llm(messageUid=uid, threadTopic=threadTopic, kbUid=kbUid,
                                    question=question, search_results=search_results)
    except Exception:
        # Callback boundary: log with traceback and keep the subscription alive.
        logging.exception(f"on_message: handling {type_} failed")
|
||||
|
||||
|
||||
def parse_file(fileUid: str, fileUrl: str, kbUid: str):
    """Download a file, index it, and publish the outcome.

    On success a ``PARSE_FILE_SUCCESS`` message with the stored document
    ids is published. If ``fileUrl`` is empty, or downloading/parsing
    raises, a ``PARSE_FILE_ERROR`` message is published instead so the
    requester always receives a reply.

    Args:
        fileUid: identifier of the file being parsed.
        fileUrl: URL the file is downloaded from.
        kbUid: identifier of the knowledge base the chunks belong to.
    """
    # Local import: this module's top-level import list does not bring in
    # the error publisher.
    from app.redisPubsub import publishParseFileError

    if not fileUrl:
        logging.warning(f"parse_file: empty fileUrl for {fileUid}")
        publishParseFileError(fileUid=fileUid, errorMsg="fileUrl is empty")
        return

    try:
        # Download the file locally, then load/chunk/store it.
        filePath = download_file(fileUrl)
        logging.info(f"Saved file: {filePath}")
        docIds = load_and_parse(fileUid=fileUid, filePath=filePath, kbUid=kbUid)
    except Exception as err:
        logging.exception(f"parse_file failed: {fileUid}, {fileUrl}")
        publishParseFileError(fileUid=fileUid, errorMsg=str(err))
        return

    publishParseFileSuccess(fileUid=fileUid, docIds=docIds)
|
||||
|
||||
def delete_docs(fileUid: str, docIds: List[str]):
    """Delete the given documents from the vector store and publish the outcome.

    A truthy result from ``myredisVector.delete_docs`` publishes
    ``DELETE_FILE_SUCCESS``; a falsy result publishes ``DELETE_FILE_ERROR``.

    Args:
        fileUid: identifier of the file whose documents are removed.
        docIds: ids of the stored documents to delete.
    """
    deleted = myredisVector.delete_docs(docIds=docIds)
    if not deleted:
        logging.info(f"delete fileUid fail: {deleted} {docIds}")
        publishDeleteFileError(fileUid=fileUid, errorMsg="delete fail")
        return
    logging.info(f"delete fileUid success: {deleted} {docIds}")
    publishDeleteFileSuccess(fileUid=fileUid)
|
||||
|
||||
|
||||
def setKey(key: str, value: str):
    """Store ``value`` under ``key`` in Redis."""
    redisClient.set(name=key, value=value)
|
||||
|
||||
|
||||
def getKey(key: str):
    """Return the value stored under ``key`` in Redis.

    Args:
        key: Redis key to read.

    Returns:
        Whatever ``redisClient.get`` returns for ``key``.
    """
    # The lookup result was previously discarded, so callers always got None.
    return redisClient.get(key)
|
||||
146
modules/python/app/redisPubsub.py
Normal file
146
modules/python/app/redisPubsub.py
Normal file
@@ -0,0 +1,146 @@
|
||||
'''
|
||||
Author: jackning 270580156@qq.com
|
||||
Date: 2024-08-29 18:21:14
|
||||
LastEditors: jackning 270580156@qq.com
|
||||
LastEditTime: 2024-08-31 10:18:58
|
||||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||||
contact: 270580156@qq.com
|
||||
技术/商务联系:270580156@qq.com
|
||||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||||
'''
|
||||
import json
|
||||
import logging
|
||||
from typing import List
|
||||
import redis
|
||||
from app.config import get_settings
|
||||
from app.consts import DELETE_FILE_ERROR, DELETE_FILE_SUCCESS, PARSE_FILE_ERROR, PARSE_FILE_SUCCESS, ANSWER, ANSWER_FINISHED, pubsubChannel
|
||||
#
|
||||
# Redis client used for publishing; responses are decoded to str.
# get_settings() is cached, so reading it once yields the same values.
_settings = get_settings()
redisClient = redis.Redis(
    host=_settings.REDIS_HOST,
    password=_settings.REDIS_PASSWORD,
    port=_settings.REDIS_PORT,
    decode_responses=True,
)
|
||||
#
|
||||
|
||||
|
||||
def publishParseFileSuccess(fileUid: str, docIds: List[str]) -> None:
    """Publish a ``PARSE_FILE_SUCCESS`` message for ``fileUid``.

    The message content is a JSON-encoded object carrying ``fileUid`` and
    ``docIds``, nested as a string inside the outer JSON envelope.
    """
    payload = {"fileUid": fileUid, "docIds": docIds}
    envelope = {"type": PARSE_FILE_SUCCESS, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
|
||||
|
||||
|
||||
def publishParseFileError(fileUid: str, errorMsg: str) -> None:
    """Publish a ``PARSE_FILE_ERROR`` message for ``fileUid``.

    The message content is a JSON-encoded object carrying ``fileUid`` and
    ``errorMsg``, nested as a string inside the outer JSON envelope.
    """
    payload = {"fileUid": fileUid, "errorMsg": errorMsg}
    envelope = {"type": PARSE_FILE_ERROR, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
|
||||
|
||||
|
||||
def publishDeleteFileSuccess(fileUid: str) -> None:
    """Publish a ``DELETE_FILE_SUCCESS`` message for ``fileUid``.

    The message content is a JSON-encoded object carrying ``fileUid``,
    nested as a string inside the outer JSON envelope.
    """
    payload = {"fileUid": fileUid}
    envelope = {"type": DELETE_FILE_SUCCESS, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
|
||||
|
||||
|
||||
def publishDeleteFileError(fileUid: str, errorMsg: str) -> None:
    """Publish a ``DELETE_FILE_ERROR`` message for ``fileUid``.

    The message content is a JSON-encoded object carrying ``fileUid`` and
    ``errorMsg``, nested as a string inside the outer JSON envelope.
    """
    payload = {"fileUid": fileUid, "errorMsg": errorMsg}
    envelope = {"type": DELETE_FILE_ERROR, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
|
||||
|
||||
|
||||
def publishAnswerMessage(
        id: int,
        uid: str,
        threadTopic: str,
        kbUid: str,
        question: str,
        answer: str,
        model: str,
        created: int) -> None:
    """Publish one ``ANSWER`` message.

    All arguments are copied verbatim into the JSON-encoded content
    object, which is nested as a string inside the outer JSON envelope.
    """
    payload = dict(
        id=id,
        uid=uid,
        threadTopic=threadTopic,
        kbUid=kbUid,
        question=question,
        answer=answer,
        model=model,
        created=created,
    )
    envelope = {"type": ANSWER, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
|
||||
|
||||
|
||||
def publishAnswerFinished(
        id: int,
        uid: str,
        threadTopic: str,
        kbUid: str,
        question: str,
        answer: str,
        model: str,
        created: int,
        promptTokens: str,
        completionTokens: str,
        totalTokens: str) -> None:
    """Publish the ``ANSWER_FINISHED`` message, including token usage.

    All arguments are copied verbatim into the JSON-encoded content
    object, which is nested as a string inside the outer JSON envelope.
    """
    payload = dict(
        id=id,
        uid=uid,
        threadTopic=threadTopic,
        kbUid=kbUid,
        question=question,
        answer=answer,
        model=model,
        created=created,
        promptTokens=promptTokens,
        completionTokens=completionTokens,
        totalTokens=totalTokens,
    )
    envelope = {"type": ANSWER_FINISHED, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
|
||||
|
||||
#
|
||||
|
||||
|
||||
def defaultPublish(content: str) -> None:
    """Publish ``content`` on the default channel (``pubsubChannel``)."""
    publish(channel=pubsubChannel, message=content)
|
||||
|
||||
|
||||
def publish(channel, message):
    """Publish ``message`` on the Redis pub/sub ``channel``."""
    # Payloads are intentionally not logged here.
    redisClient.publish(channel, message)
|
||||
194
modules/python/app/redisVector.py
Normal file
194
modules/python/app/redisVector.py
Normal file
@@ -0,0 +1,194 @@
|
||||
'''
|
||||
Author: jackning 270580156@qq.com
|
||||
Date: 2024-08-29 14:49:54
|
||||
LastEditors: jackning 270580156@qq.com
|
||||
LastEditTime: 2024-09-03 09:14:22
|
||||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||||
contact: 270580156@qq.com
|
||||
技术/商务联系:270580156@qq.com
|
||||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||||
'''
|
||||
#
|
||||
import logging
|
||||
from typing import List
|
||||
from langchain_redis import RedisConfig, RedisVectorStore
|
||||
from langchain_huggingface import HuggingFaceEmbeddings
|
||||
from langchain_community.vectorstores.redis.filters import RedisFilter
|
||||
from langchain.docstore.document import Document
|
||||
from redisvl.query.filter import Tag
|
||||
from zhipuai import ZhipuAI
|
||||
|
||||
from app.config import get_settings
|
||||
from app.redisPubsub import publishAnswerMessage, publishAnswerFinished
|
||||
|
||||
# ZhipuAI client; the API key comes from the ZHIPU_API_KEY setting.
zhipuAi = ZhipuAI(api_key=get_settings().ZHIPU_API_KEY)

# Tag fields declared for the Redis index; passed to RedisConfig below.
redis_index_schema = {
    "tag": [
        {"name": "kb_uid"},
        {"name": "file_uid"},
    ],
}
|
||||
|
||||
# https://python.langchain.com/v0.2/docs/integrations/vectorstores/redis/
|
||||
|
||||
class MyRedisVector:
    """Redis vector store wrapper plus the LLM call that answers from it.

    Chunks are stored with ``kb_uid`` / ``file_uid`` tag metadata so that
    searches can be restricted to one knowledge base.
    See https://python.langchain.com/v0.2/docs/integrations/vectorstores/redis/
    """

    # Embedding model passed to the vector store.
    embeddings: HuggingFaceEmbeddings
    # LangChain Redis vector store built on top of ``embeddings``.
    vector_store: RedisVectorStore

    def __init__(self):
        """Load the embedding model and open the vector store from settings."""
        settings = get_settings()
        self.embeddings = HuggingFaceEmbeddings(
            model_name=settings.EMBEDDINGS_PATH)
        config = RedisConfig(
            index_name=settings.REDIS_INDEX_NAME,
            key_prefix=settings.REDIS_KEY_PREFIX,
            redis_url=settings.REDIS_URL,
            redis_index_schema=redis_index_schema,
            metadata_schema=[
                {"name": "kb_uid", "type": "tag"},
                {"name": "file_uid", "type": "tag"},
            ],
        )
        self.vector_store = RedisVectorStore(self.embeddings, config=config)

    def add_docs(self, docs: List[Document]) -> List[str]:
        """Store ``docs`` in the vector store and return the ids it reports."""
        results = self.vector_store.add_documents(docs)
        logging.info(f"add_texts result: {results}")
        return results

    def delete_docs(self, docIds: List[str]) -> int:
        """Delete the entries whose keys are ``docIds``.

        Returns:
            0 when ``docIds`` is empty or ``None``; otherwise the value
            returned by the index's ``drop_keys``.
        """
        if not docIds:
            return 0
        return self.vector_store.index.drop_keys(docIds)

    # FIXME: RuntimeError: Index has not been created. Must be created before calling search
    def search_docs(self, kbUid: str, query: str) -> List[Document]:
        """Return the 3 chunks of knowledge base ``kbUid`` most similar to ``query``."""
        kb_filter = Tag("kb_uid") == kbUid
        return self.vector_store.similarity_search(query, k=3, filter=kb_filter)

    # https://python.langchain.com/docs/integrations/vectorstores/redis#redis-as-retriever
    def search_as_retriever(self, kbUid: str, query: str) -> List[Document]:
        """Search through a LangChain retriever restricted to ``kbUid``.

        Uses the plain ``similarity`` search type with ``k=3``. The
        ``similarity_distance_threshold`` and ``similarity_score_threshold``
        search types are alternatives that are not enabled here.
        """
        # The query is passed as plain text, as in the LangChain demo.
        kb_filter = Tag("kb_uid") == kbUid
        retriever = self.vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 3, "filter": kb_filter},
        )
        # invoke() replaces the deprecated get_relevant_documents().
        results = retriever.invoke(query)
        logging.info(f'search_as_retriever results {results}, {len(results)}')
        return results

    def query_llm(self, messageUid: str, threadTopic: str, kbUid: str, question: str, search_results: List[Document]):
        """Stream an LLM answer grounded in ``search_results`` over pub/sub.

        Every streamed chunk is published as an ``ANSWER`` message. The
        chunk that carries a finish reason is published as
        ``ANSWER_FINISHED`` together with the token usage.

        Args:
            messageUid: forwarded as ``uid`` in every published message.
            threadTopic: forwarded unchanged in every published message.
            kbUid: forwarded unchanged in every published message.
            question: the user question, sent as the ``user`` message.
            search_results: retrieved chunks whose ``page_content`` forms
                the known-information part of the system prompt.
        """
        context = "\n".join([doc.page_content for doc in search_results])
        # Prompt for answering from local knowledge only.
        # TODO: write an english version
        prompt_template = f'''
### Human: <指令>根据已知信息,简洁和专业的来回答问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题”,不允许在答案中添加编造成分,答案请使用跟问题中同样的语言。 </指令>
<已知信息>{ context }</已知信息>###
Assistant:'''
        # The previous f-string "{ { prompt_template } }" built a set
        # literal and printed its repr rather than the prompt text.
        logging.debug(f'prompt_template: {prompt_template}')

        # https://open.bigmodel.cn/dev/api#sdk_example
        response = zhipuAi.chat.completions.create(
            model="glm-4-flash",  # free tier
            messages=[
                {"role": "system", "content": prompt_template},
                {"role": "user", "content": question},
            ],
            top_p=0.7,
            temperature=0.3,
            stream=True
        )

        counter = 0
        for chunk in response:
            # Sample chunks:
            # ChatCompletionChunk(id='...', choices=[Choice(delta=ChoiceDelta(content='凡', role='assistant', tool_calls=None), finish_reason=None, index=0)], created=1724922535, model='glm-4-air', usage=None, extra_json=None)
            # ChatCompletionChunk(id='...', choices=[Choice(delta=ChoiceDelta(content='', role='assistant', tool_calls=None), finish_reason='stop', index=0)], created=1724922535, model='glm-4-air', usage=CompletionUsage(prompt_tokens=453, completion_tokens=34, total_tokens=487), extra_json=None)
            if not chunk.choices:
                # Nothing to publish for a chunk without choices.
                continue
            counter += 1
            choice = chunk.choices[0]
            answer = choice.delta.content
            finish_reason = choice.finish_reason
            logging.info(
                f'query_llm: {counter} {choice.delta.content}, {choice.finish_reason}, {chunk.model}, {chunk.created}, {chunk.usage}')
            if finish_reason:
                # Any finish reason (not only 'stop') ends the stream, so
                # the consumer must always receive ANSWER_FINISHED.
                usage = chunk.usage
                publishAnswerFinished(
                    id=counter,
                    uid=messageUid,
                    threadTopic=threadTopic,
                    kbUid=kbUid,
                    question=question,
                    answer=answer,
                    model=chunk.model,
                    created=chunk.created,
                    # Usage may be absent on an abnormal finish; report 0.
                    promptTokens=usage.prompt_tokens if usage else 0,
                    completionTokens=usage.completion_tokens if usage else 0,
                    totalTokens=usage.total_tokens if usage else 0
                )
            else:
                publishAnswerMessage(
                    id=counter,
                    uid=messageUid,
                    threadTopic=threadTopic,
                    kbUid=kbUid,
                    question=question,
                    answer=answer,
                    model=chunk.model,
                    created=chunk.created
                )
|
||||
|
||||
|
||||
# Module-level instance imported by the other modules; created at import time.
myredisVector = MyRedisVector()
|
||||
#
|
||||
@@ -0,0 +1,107 @@
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, List, Optional
|
||||
|
||||
# from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
|
||||
# from chatchat.utils import build_logger
|
||||
# logger = build_logger()
|
||||
|
||||
def _split_text_with_regex_from_end(
|
||||
text: str, separator: str, keep_separator: bool
|
||||
) -> List[str]:
|
||||
# Now that we have the separator, split the text
|
||||
if separator:
|
||||
if keep_separator:
|
||||
# The parentheses in the pattern keep the delimiters in the result.
|
||||
_splits = re.split(f"({separator})", text)
|
||||
splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])]
|
||||
if len(_splits) % 2 == 1:
|
||||
splits += _splits[-1:]
|
||||
# splits = [_splits[0]] + splits
|
||||
else:
|
||||
splits = re.split(separator, text)
|
||||
else:
|
||||
splits = list(text)
|
||||
return [s for s in splits if s != ""]
|
||||
|
||||
|
||||
class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive text splitter whose default separators cover Chinese punctuation.

    The separators are tried in order: paragraph breaks, line breaks,
    Chinese and Western sentence-ending punctuation, semicolons, commas.
    """

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        is_separator_regex: bool = True,
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter.

        Args:
            separators: ordered separators to try; defaults to the list below.
            keep_separator: keep each separator at the end of the preceding piece.
            is_separator_regex: treat separators as regular expressions. The
                default separators ARE regular expressions, so this must stay
                True when they are used.
            **kwargs: forwarded to ``RecursiveCharacterTextSplitter``.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        # Raw strings: "\." and "\s" are regex escapes, not string escapes.
        # The values are unchanged, only the invalid-escape warning goes away.
        self._separators = separators or [
            "\n\n",
            "\n",
            "。|!|?",
            r"\.\s|\!\s|\?\s",
            r";|;\s",
            r",|,\s",
        ]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return chunks.

        Picks the first separator in ``separators`` that occurs in ``text``
        (falling back to the last one), splits on it, merges the short
        pieces, and recurses with the remaining separators into any piece
        that is not shorter than the chunk size.
        """
        final_chunks = []
        # Get appropriate separator to use
        separator = separators[-1]
        new_separators = []
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
            if _s == "":
                separator = _s
                break
            if re.search(_separator, text):
                separator = _s
                new_separators = separators[i + 1:]
                break

        _separator = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex_from_end(text, _separator, self._keep_separator)

        # Now go merging things, recursively splitting longer texts.
        _good_splits = []
        # Kept separators are already part of each piece, so merge with "".
        _separator = "" if self._keep_separator else separator
        for s in splits:
            if self._length_function(s) < self._chunk_size:
                _good_splits.append(s)
            else:
                if _good_splits:
                    merged_text = self._merge_splits(_good_splits, _separator)
                    final_chunks.extend(merged_text)
                    _good_splits = []
                if not new_separators:
                    # No finer separator left: emit the oversized piece as is.
                    final_chunks.append(s)
                else:
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits:
            merged_text = self._merge_splits(_good_splits, _separator)
            final_chunks.extend(merged_text)
        # Strip each chunk, collapse runs of newlines, drop empty chunks.
        return [
            re.sub(r"\n{2,}", "\n", chunk.strip())
            for chunk in final_chunks
            if chunk.strip() != ""
        ]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual demo: split one sample report into chunks of at most ~50
    # characters (no overlap) and print every chunk.
    demo_splitter = ChineseRecursiveTextSplitter(
        keep_separator=True,
        is_separator_regex=True,
        chunk_size=50,
        chunk_overlap=0,
    )
    sample_texts = [
        """中国对外贸易形势报告(75页)。前 10 个月,一般贸易进出口 19.5 万亿元,增长 25.1%, 比整体进出口增速高出 2.9 个百分点,占进出口总额的 61.7%,较去年同期提升 1.6 个百分点。其中,一般贸易出口 10.6 万亿元,增长 25.3%,占出口总额的 60.9%,提升 1.5 个百分点;进口8.9万亿元,增长24.9%,占进口总额的62.7%, 提升 1.8 个百分点。加工贸易进出口 6.8 万亿元,增长 11.8%, 占进出口总额的 21.5%,减少 2.0 个百分点。其中,出口增 长 10.4%,占出口总额的 24.3%,减少 2.6 个百分点;进口增 长 14.2%,占进口总额的 18.0%,减少 1.2 个百分点。此外, 以保税物流方式进出口 3.96 万亿元,增长 27.9%。其中,出 口 1.47 万亿元,增长 38.9%;进口 2.49 万亿元,增长 22.2%。前三季度,中国服务贸易继续保持快速增长态势。服务 进出口总额 37834.3 亿元,增长 11.6%;其中服务出口 17820.9 亿元,增长 27.3%;进口 20013.4 亿元,增长 0.5%,进口增 速实现了疫情以来的首次转正。服务出口增幅大于进口 26.8 个百分点,带动服务贸易逆差下降 62.9%至 2192.5 亿元。服 务贸易结构持续优化,知识密集型服务进出口 16917.7 亿元, 增长 13.3%,占服务进出口总额的比重达到 44.7%,提升 0.7 个百分点。 二、中国对外贸易发展环境分析和展望 全球疫情起伏反复,经济复苏分化加剧,大宗商品价格 上涨、能源紧缺、运力紧张及发达经济体政策调整外溢等风 险交织叠加。同时也要看到,我国经济长期向好的趋势没有 改变,外贸企业韧性和活力不断增强,新业态新模式加快发 展,创新转型步伐提速。产业链供应链面临挑战。美欧等加快出台制造业回迁计 划,加速产业链供应链本土布局,跨国公司调整产业链供应 链,全球双链面临新一轮重构,区域化、近岸化、本土化、 短链化趋势凸显。疫苗供应不足,制造业“缺芯”、物流受限、 运价高企,全球产业链供应链面临压力。 全球通胀持续高位运行。能源价格上涨加大主要经济体 的通胀压力,增加全球经济复苏的不确定性。世界银行今年 10 月发布《大宗商品市场展望》指出,能源价格在 2021 年 大涨逾 80%,并且仍将在 2022 年小幅上涨。IMF 指出,全 球通胀上行风险加剧,通胀前景存在巨大不确定性。""",
    ]
    for index, sample in enumerate(sample_texts):
        # Print the sample's index first, then one chunk per line.
        print(index)
        for piece in demo_splitter.split_text(sample):
            print(piece)
|
||||
26
modules/python/app/tts.py
Normal file
26
modules/python/app/tts.py
Normal file
@@ -0,0 +1,26 @@
|
||||
#
|
||||
#
|
||||
import logging
|
||||
from fastapi import APIRouter, Request
|
||||
# from sse_starlette import EventSourceResponse
|
||||
import edge_tts
|
||||
|
||||
# https://github.com/rany2/edge-tts/blob/master/README.md
|
||||
# https://tts.byylook.com/ai/text-to-speech?source=github
|
||||
# Router for the text-to-speech endpoints; every route registered on it is
# served under the '/tts' prefix and tagged 'tts v1 apis'.
router = APIRouter(prefix='/tts', tags=['tts v1 apis'])
|
||||
|
||||
#
|
||||
# https://github.com/rany2/edge-tts/blob/master/examples/basic_generation.py
|
||||
# # http://127.0.0.1:9007/api/v1/tts/test
|
||||
# 列出音色:edge-tts --list-voices
|
||||
@router.get("/test")
async def tts():
    """Synthesize a fixed English phrase with edge-tts and save it as ``test.mp3``.

    The text, voice and output file name are hard-coded. Returns the string
    ``"ok"`` after the save call has completed.
    """
    output_file = "test.mp3"
    speech = edge_tts.Communicate("Hello World!", "en-GB-SoniaNeural")
    await speech.save(output_file)
    return "ok"
|
||||
49
modules/python/app/utils.py
Normal file
49
modules/python/app/utils.py
Normal file
@@ -0,0 +1,49 @@
|
||||
'''
|
||||
Author: jackning 270580156@qq.com
|
||||
Date: 2024-08-29 09:55:35
|
||||
LastEditors: jackning 270580156@qq.com
|
||||
LastEditTime: 2024-08-29 15:10:19
|
||||
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
|
||||
Please be aware of the BSL license restrictions before installing Bytedesk IM –
|
||||
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
|
||||
仅支持企业内部员工自用,严禁私自用于销售、二次销售或者部署SaaS方式销售
|
||||
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
|
||||
contact: 270580156@qq.com
|
||||
技术/商务联系:270580156@qq.com
|
||||
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
|
||||
'''
|
||||
#
|
||||
import os
|
||||
import requests
|
||||
|
||||
def download_file(
    file_url: str,
    destination_folder: str = 'files',
    timeout: float = 30.0,
) -> str | None:
    """
    Download a file into the given folder.

    :param file_url: URL of the file to download.
    :param destination_folder: Folder the file is saved into; created when it
        does not exist. Defaults to 'files'.
    :param timeout: Seconds to wait for the server before giving up; passed
        to ``requests.get``.
    :return: Path of the saved file, or ``None`` when the server does not
        answer with HTTP 200.
    :raises ValueError: If no file name can be derived from the URL path.
    :raises requests.RequestException: On connection errors or timeouts.
    """
    from urllib.parse import urlparse

    # Derive the name from the URL *path* only, so a query string or fragment
    # (e.g. "report.pdf?token=...") never ends up in the file name.
    file_name = os.path.basename(urlparse(file_url).path)
    if file_name in ("", ".", ".."):
        raise ValueError(f"cannot derive a file name from URL: {file_url!r}")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(destination_folder, exist_ok=True)

    # Build the full path the file is saved to.
    file_path = os.path.join(destination_folder, file_name)

    # Stream the body so large files are never held in memory at once; the
    # timeout keeps a stalled server from blocking the caller forever.
    with requests.get(file_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"下载失败,状态码: {response.status_code}")
            # Nothing was written, so do not hand back a path to a missing file.
            return None

        # Write the content to disk chunk by chunk in binary mode.
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    print(f"文件已成功下载到: {file_path}")

    # Return the path the file was saved to.
    return file_path
|
||||
Reference in New Issue
Block a user