Sync from bytedesk-private: update

This commit is contained in:
jack ning
2024-12-14 10:43:18 +08:00
parent 476eebb101
commit 5e082909e4
3421 changed files with 812709 additions and 0 deletions

View File

24
modules/python/app/api.py Normal file
View File

@@ -0,0 +1,24 @@
'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 10:35:52
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-29 10:38:28
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售、二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
#
from fastapi import APIRouter
from app import chat, config, doc, tts
#
# Aggregate router for the v1 API: every feature module exposes its own
# ``router`` object, which is mounted here in a fixed order.
api_v1_router = APIRouter()
for _feature in (chat, config, doc, tts):
    api_v1_router.include_router(_feature.router)

View File

@@ -0,0 +1,47 @@
'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 09:55:35
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-31 19:19:57
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售、二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
#
import logging
from fastapi import APIRouter, Request
from app.redisVector import myredisVector
# Router for the chat endpoints; every route below is served under /chat.
router = APIRouter(prefix='/chat', tags=['chat v1 apis'])
# http://127.0.0.1:9007/api/v1/chat/query?kbuid=1461090177253570&query=报名条件
# http://127.0.0.1:9007/api/v1/chat/query?kbuid=1461487033909519&query=DataStructure
@router.get("/query")
def query(kbuid: str, query: str):
    """Run a similarity search inside one knowledge base.

    :param kbuid: uid of the knowledge base to search in.
    :param query: free-text question to search for.
    :return: ``{"results": [...]}`` holding whatever
        ``myredisVector.search_docs`` returns.
    """
    # ``myredisVector.search_as_retriever`` is the retriever-based alternative.
    matches = myredisVector.search_docs(kbUid=kbuid, query=query)
    return {"results": matches}
# # http://127.0.0.1:9007/api/v1/chat/stream?kbuid=1461090177253570&query=报名条件
# # http://127.0.0.1:9007/api/v1/chat/stream?kbuid=1461487033909519&query=DataStructure
# @router.get("/stream")
# async def query(kbuid: str, query: str):
# logging.info(f'stream: {kbuid}, {query}')
# # TODO: query from db/cache, if match then return, if not then goto llm
# search_results = myredisVector.search_docs(kbUid=kbuid, query=query)
# logging.info(f'搜索结果: count={ len(search_results) }')
# await myredisVector.query_llm(messageUid='', threadTopic='', kbUid=kbuid, question=query, search_results=search_results)
# return {
# 'message': 'ok'
# }

View File

@@ -0,0 +1,76 @@
'''
Author: jackning 270580156@qq.com
Date: 2023-12-26 11:20:33
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-31 19:18:09
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售、二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
from functools import lru_cache
from typing import Annotated
from fastapi import APIRouter, Depends
from pydantic_settings import BaseSettings
# https://fastapi.tiangolo.com/zh/advanced/settings/#pydantic-settings
class Settings(BaseSettings):
    """Typed application settings loaded by pydantic-settings.

    No field declares a default, so every value has to be supplied through
    the environment or the ``.env`` file named in ``Config``.
    """

    # --- general ---
    DEBUG: bool
    API_V1_PREFIX: str

    # --- embeddings model ---
    EMBEDDINGS_PATH: str

    # --- Zhipu AI ---
    ZHIPU_API_KEY: str

    # --- MySQL database connection ---
    DATABASE_URL: str
    ASYNC_DATABASE_URL: str

    # --- Redis ---
    REDIS_HOST: str
    REDIS_PORT: int
    REDIS_PASSWORD: str
    REDIS_URL: str
    REDIS_KEY_PREFIX: str
    REDIS_INDEX_NAME: str
    IS_VECTOR_STORE_INITIATED: str

    # https://docs.pydantic.dev/latest/api/config/
    # https://fastapi.tiangolo.com/zh/advanced/settings/#env_1
    class Config:
        # Match environment variable names case-insensitively.
        case_sensitive = False
        # Dotenv file the settings are read from.
        env_file = ".env"
#
# settings = Settings()
@lru_cache(maxsize=32)
def get_settings():
    """Return the application settings, building them once and caching them.

    Because of ``lru_cache`` the environment / ``.env`` file is only read on
    the first call; call ``get_settings.cache_clear()`` to force a reload.
    """
    loaded = Settings()
    return loaded
# Router for the settings endpoints; routes are served under /settings.
router = APIRouter(prefix='/settings', tags=['settings v1 apis'])
# # http://127.0.0.1:9007/api/v1/settings/info
# @router.get("/info")
# async def info(settings: Annotated[Settings, Depends(get_settings)]):
# print('cache info: ', get_settings.cache_info(), settings.API_V1_PREFIX)
# if (settings.DEBUG):
# return {
# "env": settings.model_dump()
# }
# return {
# "env": 'None'
# }
# # http://127.0.0.1:9007/api/v1/settings/clear_cache
# # 添加一个清除缓存的路由
# @router.get("/clear_cache")
# def clear_cache():
# print('cache info: ', get_settings.cache_info())
# get_settings.cache_clear() # 清除get_settings函数的缓存
# return {"message": "Cache cleared"}

View File

@@ -0,0 +1,31 @@
'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 18:23:40
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-30 16:54:39
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售、二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
#
# Redis pub/sub channel that carries every event type listed below.
pubsubChannel: str = 'bytedeskim:pubsub'
# FIXME: the encoding/decoding has to be unified with the Java server first:
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xac in position 0: invalid start byte
# pubsubObjectChannel: str = 'bytedeskim:pubsub_object'

# --- file parsing: request and its two possible outcomes ---
PARSE_FILE: str = 'PARSE_FILE'
PARSE_FILE_SUCCESS: str = 'PARSE_FILE_SUCCESS'
PARSE_FILE_ERROR: str = 'PARSE_FILE_ERROR'

# --- file deletion: request and its two possible outcomes ---
DELETE_FILE: str = 'DELETE_FILE'
DELETE_FILE_SUCCESS: str = 'DELETE_FILE_SUCCESS'
DELETE_FILE_ERROR: str = 'DELETE_FILE_ERROR'

# --- question answering: request, streamed answer chunk, end of answer ---
QUESTION: str = 'QUESTION'
ANSWER: str = 'ANSWER'
ANSWER_FINISHED: str = 'ANSWER_FINISHED'

111
modules/python/app/doc.py Normal file
View File

@@ -0,0 +1,111 @@
'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 09:55:30
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-31 07:07:31
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售、二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
#
import logging
import uuid
from typing import List
from fastapi import APIRouter
from app.redisVector import myredisVector
from langchain_community.document_loaders import (
PyPDFLoader,
TextLoader,
UnstructuredWordDocumentLoader,
CSVLoader,
UnstructuredMarkdownLoader,
UnstructuredEPubLoader,
UnstructuredHTMLLoader,
UnstructuredImageLoader,
UnstructuredExcelLoader,
UnstructuredXMLLoader,
UnstructuredRTFLoader
)
# from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_unstructured import UnstructuredLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
from app.textsplitter.chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
from langchain.docstore.document import Document
# https://python.langchain.com/v0.2/docs/integrations/document_loaders/pypdfloader/
# https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/
# Ordered (suffixes, loader class) pairs used by ``load_files``. Suffixes are
# lower-case because the path is lower-cased before matching.
_LOADERS_BY_SUFFIX = (
    ((".pdf",), PyPDFLoader),
    ((".txt",), TextLoader),
    ((".doc", ".docx"), UnstructuredWordDocumentLoader),
    ((".md",), UnstructuredMarkdownLoader),
    ((".html",), UnstructuredHTMLLoader),
    ((".png", ".jpg", ".jpeg"), UnstructuredImageLoader),
    ((".xlsx",), UnstructuredExcelLoader),
    ((".csv",), CSVLoader),
    ((".xml",), UnstructuredXMLLoader),
    ((".rtf",), UnstructuredRTFLoader),
    ((".epub",), UnstructuredEPubLoader),
)


def load_files(filePath: str):
    """Load a file into LangChain documents using a loader chosen by extension.

    The extension is matched case-insensitively, so ``report.PDF`` is handled
    by the PDF loader instead of falling through to the generic loader.
    Unknown extensions are handed to ``UnstructuredLoader``.

    :param filePath: path of the file to load.
    :return: the documents produced by the selected loader's ``load()``.
    """
    lowered = filePath.lower()
    loader_cls = next(
        (cls for suffixes, cls in _LOADERS_BY_SUFFIX if lowered.endswith(suffixes)),
        UnstructuredLoader,
    )
    # The loader always receives the original (not lower-cased) path.
    loader = loader_cls(filePath)
    return loader.load()
def split_docs(docs: List[Document]) -> List[Document]:
    """Split documents into chunks of at most roughly 500 characters.

    ``ChineseRecursiveTextSplitter``'s default separators are regular
    expressions (alternations such as ``\\.\\s|\\!\\s|\\?\\s``), so they must
    be applied as regexes. With ``is_separator_regex=False`` they are escaped
    and matched literally, which never succeeds; a long paragraph without
    newlines is then emitted as one oversize chunk.

    :param docs: documents to split.
    :return: the chunked documents.
    """
    text_splitter = ChineseRecursiveTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
        is_separator_regex=True,
    )
    return text_splitter.split_documents(docs)
# Load a file, split it into chunks, tag every chunk and store the chunks in Redis
def load_and_parse(fileUid: str, filePath: str, kbUid: str) -> List[str]:
    """Load, chunk, tag and index one file.

    :param fileUid: uid of the uploaded file, stored on every chunk as ``file_uid``.
    :param filePath: local path of the file to parse.
    :param kbUid: uid of the owning knowledge base, stored as ``kb_uid``.
    :return: the ids returned by ``myredisVector.add_docs`` for the stored chunks.
    """
    logging.info(f"load_and_parse: {filePath}")
    # Parse the file into documents.
    loaded = load_files(filePath)
    logging.info(f"Loaded {len(loaded)} documents")
    # Chunk the documents and attach the metadata used for filtering later on.
    chunks = split_docs(loaded)
    for chunk in chunks:
        chunk.metadata.update(
            uid=uuid.uuid4().hex,
            file_uid=fileUid,
            kb_uid=kbUid,
        )
    logging.info(f"Split into {len(chunks)} chunks")
    # Persist the chunks in the Redis vector store.
    stored_ids = myredisVector.add_docs(chunks)
    logging.info("Stored in redis")
    return stored_ids
# Router for the document endpoints; routes are served under /docs.
router = APIRouter(prefix='/docs', tags=['docs v1 apis'])
@router.get("/test")
def test():
    """Smoke-test endpoint that returns a fixed payload."""
    payload = {"docs": "test"}
    return payload

110
modules/python/app/redis.py Normal file
View File

@@ -0,0 +1,110 @@
'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 09:55:35
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-31 11:40:51
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售、二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
#
# import time
import logging
from typing import List
import redis
import json
import uuid
from app.redisPubsub import defaultPublish, publishDeleteFileError, publishDeleteFileSuccess, publishParseFileSuccess
from app.redisVector import myredisVector
from app.config import get_settings
from app.doc import load_and_parse
from app.utils import download_file
from app.consts import DELETE_FILE, PARSE_FILE, QUESTION, pubsubChannel
# Redis connection used for subscribing and for the key/value helpers below.
# ``decode_responses=True`` makes redis-py return ``str`` instead of ``bytes``.
redisClient = redis.Redis(
    host=get_settings().REDIS_HOST,
    port=get_settings().REDIS_PORT,
    password=get_settings().REDIS_PASSWORD,
    decode_responses=True,
)
def defaultSubscribe() -> None:
    """Start listening on the default pub/sub channel (``pubsubChannel``)."""
    # The object channel (pubsubObjectChannel) is disabled for now.
    subscribe(channel=pubsubChannel)
def subscribe(channel):
    """Subscribe ``on_message`` to ``channel`` and poll it in a background thread.

    The channel is subscribed exactly once, with the handler attached. The
    former extra handler-less ``subscribe(channel)`` call only sent a second,
    redundant SUBSCRIBE command.

    :param channel: name of the Redis pub/sub channel to listen on.
    """
    logging.info(f'subscribe channel: {channel}')
    pubsub = redisClient.pubsub()
    # Keyword form registers the callable as the message handler for the channel.
    pubsub.subscribe(**{channel: on_message})
    # Poll in a worker thread, sleeping 0.1 s between polls.
    pubsub.run_in_thread(sleep_time=0.1)
#
def on_message(message):
    """Dispatch one pub/sub message to the matching handler.

    The payload is expected to be a JSON object of the form
    ``{"type": <event type>, "content": <JSON-encoded object>}``.

    Only ``PARSE_FILE``, ``DELETE_FILE`` and ``QUESTION`` are handled; every
    other type (including this service's own outgoing events, which travel on
    the same channel) is ignored *before* its ``content`` is parsed. Malformed
    payloads and handler failures are logged instead of raised, because an
    exception escaping this callback would terminate the pub/sub worker
    thread and silently stop all further processing.

    :param message: redis-py message dict; only ``message['data']`` is read.
    """
    try:
        data_dict = json.loads(message['data'])
    except (KeyError, TypeError, ValueError):
        logging.warning("on_message: ignoring payload that is not valid JSON")
        return
    if not isinstance(data_dict, dict):
        logging.warning("on_message: ignoring payload that is not a JSON object")
        return

    type_ = data_dict.get('type')
    if type_ not in (PARSE_FILE, DELETE_FILE, QUESTION):
        # Not addressed to this service; ``content`` may not even be JSON.
        return

    try:
        content_dict = json.loads(data_dict.get('content'))
    except (TypeError, ValueError):
        logging.warning(f"on_message: invalid content for type {type_}")
        return
    if not isinstance(content_dict, dict):
        logging.warning(f"on_message: content for type {type_} is not a JSON object")
        return

    try:
        if type_ == PARSE_FILE:
            fileUid = content_dict.get('fileUid')
            fileUrl = content_dict.get('fileUrl')
            kbUid = content_dict.get('kbUid')
            logging.info(f"Received type: {type_}, {fileUid}, {fileUrl}, {kbUid}")
            parse_file(fileUid, fileUrl, kbUid)
        elif type_ == DELETE_FILE:
            fileUid = content_dict.get('fileUid')
            docIds = content_dict.get('docIds')
            logging.info(f"Received type: {type_}, {fileUid}, {docIds}")
            delete_docs(fileUid=fileUid, docIds=docIds)
        else:  # QUESTION
            uid = content_dict.get('uid')
            threadTopic = content_dict.get('threadTopic')
            kbUid = content_dict.get('kbUid')
            question = content_dict.get('question')
            logging.info(f"Received type: {type_}, {threadTopic}, {kbUid}, {question}")
            search_results = myredisVector.search_docs(kbUid=kbUid, query=question)
            myredisVector.query_llm(
                messageUid=uid,
                threadTopic=threadTopic,
                kbUid=kbUid,
                question=question,
                search_results=search_results,
            )
    except Exception:
        # Boundary of the message loop: log and keep the worker thread alive.
        logging.exception(f"on_message: handling {type_} failed")
def parse_file(fileUid: str, fileUrl: str, kbUid: str):
    """Download a file, index it and publish the outcome.

    On success ``PARSE_FILE_SUCCESS`` is published with the stored document
    ids. If ``fileUrl`` is empty, the download fails or parsing raises, a
    ``PARSE_FILE_ERROR`` event is published instead. Previously an empty
    ``fileUrl`` crashed on an unbound ``filePath`` and no failure was ever
    reported to the requester.

    :param fileUid: uid of the file being parsed.
    :param fileUrl: URL the file is downloaded from.
    :param kbUid: uid of the knowledge base the file belongs to.
    """
    # Local import: this name is not part of the module-level import list.
    from app.redisPubsub import publishParseFileError

    if not fileUrl:
        logging.warning(f"parse_file: empty fileUrl for fileUid {fileUid}")
        publishParseFileError(fileUid=fileUid, errorMsg="fileUrl is empty")
        return
    try:
        # Download into the local files folder, then parse and index.
        filePath = download_file(fileUrl)
        if not filePath:
            raise RuntimeError(f"download failed: {fileUrl}")
        logging.info(f"Saved file: {filePath}")
        docIds = load_and_parse(fileUid=fileUid, filePath=filePath, kbUid=kbUid)
    except Exception as exc:
        # Report the failure to the requester instead of dying silently.
        logging.exception(f"parse_file failed for fileUid {fileUid}")
        publishParseFileError(fileUid=fileUid, errorMsg=str(exc))
        return
    publishParseFileSuccess(fileUid=fileUid, docIds=docIds)
def delete_docs(fileUid: str, docIds: List[str]):
    """Delete a file's stored chunks and publish the outcome.

    :param fileUid: uid of the file whose chunks are removed.
    :param docIds: ids of the chunks to delete from the vector store.
    """
    result = myredisVector.delete_docs(docIds=docIds)
    # A falsy result (nothing deleted) is reported as a failure.
    if not result:
        logging.info(f"delete fileUid fail: {result} {docIds}")
        publishDeleteFileError(fileUid=fileUid, errorMsg="delete fail")
        return
    logging.info(f"delete fileUid success: {result} {docIds}")
    publishDeleteFileSuccess(fileUid=fileUid)
def setKey(key: str, value: str):
    """Store ``value`` under ``key`` in Redis."""
    redisClient.set(name=key, value=value)
def getKey(key: str):
    """Return the value stored under ``key`` in Redis.

    The original implementation fetched the value but discarded it, so the
    function always returned ``None``.

    :param key: Redis key to read.
    :return: whatever ``redisClient.get`` returns for the key.
    """
    return redisClient.get(key)

View File

@@ -0,0 +1,146 @@
'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 18:21:14
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-31 10:18:58
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售、二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
import json
import logging
from typing import List
import redis
from app.config import get_settings
from app.consts import DELETE_FILE_ERROR, DELETE_FILE_SUCCESS, PARSE_FILE_ERROR, PARSE_FILE_SUCCESS, ANSWER, ANSWER_FINISHED, pubsubChannel
#
# Redis connection used for publishing events.
# ``decode_responses=True`` makes redis-py return ``str`` instead of ``bytes``.
redisClient = redis.Redis(
    host=get_settings().REDIS_HOST,
    port=get_settings().REDIS_PORT,
    password=get_settings().REDIS_PASSWORD,
    decode_responses=True,
)
#
def publishParseFileSuccess(fileUid: str, docIds: List[str]) -> None:
    """Publish a ``PARSE_FILE_SUCCESS`` event.

    :param fileUid: uid of the parsed file.
    :param docIds: ids of the chunks that were stored for the file.
    """
    payload = {"fileUid": fileUid, "docIds": docIds}
    # ``content`` is itself a JSON string nested inside the JSON envelope.
    envelope = {"type": PARSE_FILE_SUCCESS, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
def publishParseFileError(fileUid: str, errorMsg: str) -> None:
    """Publish a ``PARSE_FILE_ERROR`` event.

    :param fileUid: uid of the file that failed to parse.
    :param errorMsg: description of the failure.
    """
    payload = {"fileUid": fileUid, "errorMsg": errorMsg}
    # ``content`` is itself a JSON string nested inside the JSON envelope.
    envelope = {"type": PARSE_FILE_ERROR, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
def publishDeleteFileSuccess(fileUid: str) -> None:
    """Publish a ``DELETE_FILE_SUCCESS`` event for ``fileUid``."""
    payload = {"fileUid": fileUid}
    # ``content`` is itself a JSON string nested inside the JSON envelope.
    envelope = {"type": DELETE_FILE_SUCCESS, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
def publishDeleteFileError(fileUid: str, errorMsg: str) -> None:
    """Publish a ``DELETE_FILE_ERROR`` event.

    :param fileUid: uid of the file whose deletion failed.
    :param errorMsg: description of the failure.
    """
    payload = {"fileUid": fileUid, "errorMsg": errorMsg}
    # ``content`` is itself a JSON string nested inside the JSON envelope.
    envelope = {"type": DELETE_FILE_ERROR, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
def publishAnswerMessage(
        id: int,
        uid: str,
        threadTopic: str,
        kbUid: str,
        question: str,
        answer: str,
        model: str,
        created: int) -> None:
    """Publish one streamed answer chunk as an ``ANSWER`` event.

    All arguments are copied verbatim into the event's ``content`` object
    under keys of the same name.
    """
    payload = dict(
        id=id,
        uid=uid,
        threadTopic=threadTopic,
        kbUid=kbUid,
        question=question,
        answer=answer,
        model=model,
        created=created,
    )
    # ``content`` is itself a JSON string nested inside the JSON envelope.
    envelope = {"type": ANSWER, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
def publishAnswerFinished(
        id: int,
        uid: str,
        threadTopic: str,
        kbUid: str,
        question: str,
        answer: str,
        model: str,
        created: int,
        promptTokens: str,
        completionTokens: str,
        totalTokens: str) -> None:
    """Publish the final answer chunk as an ``ANSWER_FINISHED`` event.

    Carries the same fields as an ``ANSWER`` event plus the token usage
    figures. All arguments are copied verbatim into the event's ``content``
    object under keys of the same name.
    """
    payload = dict(
        id=id,
        uid=uid,
        threadTopic=threadTopic,
        kbUid=kbUid,
        question=question,
        answer=answer,
        model=model,
        created=created,
        promptTokens=promptTokens,
        completionTokens=completionTokens,
        totalTokens=totalTokens,
    )
    # ``content`` is itself a JSON string nested inside the JSON envelope.
    envelope = {"type": ANSWER_FINISHED, "content": json.dumps(payload)}
    defaultPublish(content=json.dumps(envelope, ensure_ascii=False))
#
def defaultPublish(content: str) -> None:
    """Publish ``content`` on the default channel (``pubsubChannel``)."""
    publish(channel=pubsubChannel, message=content)
def publish(channel, message):
    """Send ``message`` to the Redis pub/sub ``channel``."""
    redisClient.publish(channel=channel, message=message)

View File

@@ -0,0 +1,194 @@
'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 14:49:54
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-09-03 09:14:22
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售、二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
#
import logging
from typing import List
from langchain_redis import RedisConfig, RedisVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.redis.filters import RedisFilter
from langchain.docstore.document import Document
from redisvl.query.filter import Tag
from zhipuai import ZhipuAI
from app.config import get_settings
from app.redisPubsub import publishAnswerMessage, publishAnswerFinished
# Zhipu AI client; the API key comes from the ZHIPU_API_KEY setting.
zhipuAi = ZhipuAI(api_key=get_settings().ZHIPU_API_KEY)

# Tag fields declared for the Redis index.
redis_index_schema = {"tag": [{"name": "kb_uid"}, {"name": "file_uid"}]}
# https://python.langchain.com/v0.2/docs/integrations/vectorstores/redis/
class MyRedisVector:
    """Redis vector store plus LLM answering for knowledge-base questions.

    Documents are embedded with a HuggingFace model (``EMBEDDINGS_PATH``) and
    stored through LangChain's ``RedisVectorStore``. ``kb_uid`` and
    ``file_uid`` are declared as tag fields so that searches can be
    restricted to one knowledge base.
    See https://python.langchain.com/v0.2/docs/integrations/vectorstores/redis/
    """

    embeddings: HuggingFaceEmbeddings
    vector_store: RedisVectorStore

    def __init__(self):
        """Create the embeddings model and the Redis vector store from settings."""
        settings = get_settings()
        self.embeddings = HuggingFaceEmbeddings(
            model_name=settings.EMBEDDINGS_PATH)
        config = RedisConfig(
            index_name=settings.REDIS_INDEX_NAME,
            key_prefix=settings.REDIS_KEY_PREFIX,
            redis_url=settings.REDIS_URL,
            redis_index_schema=redis_index_schema,
            metadata_schema=[
                {"name": "kb_uid", "type": "tag"},
                {"name": "file_uid", "type": "tag"},
            ],
        )
        self.vector_store = RedisVectorStore(self.embeddings, config=config)

    def add_docs(self, docs: List[Document]) -> List[str]:
        """Store ``docs`` in the vector store and return the resulting ids."""
        results = self.vector_store.add_documents(docs)
        logging.info(f"add_texts result: {results}")
        return results

    def delete_docs(self, docIds: List[str]) -> int:
        """Delete the given keys from the index.

        :param docIds: keys to drop; an empty or missing list is a no-op.
        :return: number of deleted documents (``0`` when ``docIds`` is empty).
        """
        if not docIds:
            return 0
        return self.vector_store.index.drop_keys(docIds)

    # FIXME: RuntimeError: Index has not been created. Must be created before calling search
    def search_docs(self, kbUid: str, query: str) -> List[Document]:
        """Return the 3 chunks of knowledge base ``kbUid`` most similar to ``query``."""
        kb_filter = Tag("kb_uid") == kbUid
        return self.vector_store.similarity_search(query, k=3, filter=kb_filter)

    # https://python.langchain.com/docs/integrations/vectorstores/redis#redis-as-retriever
    def search_as_retriever(self, kbUid: str, query: str) -> List[Document]:
        """Search knowledge base ``kbUid`` through the retriever interface.

        Uses the default ``similarity`` search type. The retriever also
        supports ``similarity_distance_threshold`` (with a
        ``distance_threshold``) and ``similarity_score_threshold`` (with a
        ``score_threshold``) through ``search_type`` / ``search_kwargs``.
        """
        kb_filter = Tag("kb_uid") == kbUid
        retriever = self.vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 3, "filter": kb_filter},
        )
        # ``invoke`` replaces the deprecated ``get_relevant_documents``.
        results = retriever.invoke(query)
        logging.info(f'search_as_retriever results {results}, {len(results)}')
        return results

    def query_llm(self, messageUid: str, threadTopic: str, kbUid: str, question: str, search_results: List[Document]):
        """Ask the LLM ``question`` grounded on ``search_results`` and stream the answer.

        Every streamed chunk is published as an ``ANSWER`` event. The chunk
        that carries a finish reason is published as ``ANSWER_FINISHED``
        together with the token usage. Any finish reason ends the answer (not
        only ``'stop'``), so an answer cut short, e.g. by the length limit,
        still signals completion to the consumer.

        :param messageUid: uid of the message being answered.
        :param threadTopic: topic of the conversation thread.
        :param kbUid: uid of the knowledge base the context came from.
        :param question: the user's question.
        :param search_results: retrieved chunks used as known information.
        """
        context = "\n".join(doc.page_content for doc in search_results)
        # Prompt for answering strictly from the retrieved knowledge.
        # TODO: write an english version
        prompt_template = (
            "\n"
            "### Human: <指令>根据已知信息,简洁和专业的来回答问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题”,不允许在答案中添加编造成分,答案请使用跟问题中同样的语言。 </指令>\n"
            f"<已知信息>{context}</已知信息>###\n"
            "Assistant:"
        )
        logging.debug("prompt_template: %s", prompt_template)
        # https://open.bigmodel.cn/dev/api#sdk_example
        response = zhipuAi.chat.completions.create(
            model="glm-4-flash",  # free tier
            messages=[
                {"role": "system", "content": prompt_template},
                {"role": "user", "content": question},
            ],
            top_p=0.7,
            temperature=0.3,
            stream=True,
        )
        # ``counter`` is the 1-based sequence number published as the chunk id.
        for counter, chunk in enumerate(response, start=1):
            if not chunk.choices:
                continue
            choice = chunk.choices[0]
            answer = choice.delta.content
            finish_reason = choice.finish_reason
            logging.info(
                f'query_llm: {counter} {answer}, {finish_reason}, {chunk.model}, {chunk.created}, {chunk.usage}')
            if not finish_reason:
                publishAnswerMessage(
                    id=counter,
                    uid=messageUid,
                    threadTopic=threadTopic,
                    kbUid=kbUid,
                    question=question,
                    answer=answer,
                    model=chunk.model,
                    created=chunk.created,
                )
                continue
            # The final chunk normally carries usage; fall back to 0 if absent.
            usage = chunk.usage
            publishAnswerFinished(
                id=counter,
                uid=messageUid,
                threadTopic=threadTopic,
                kbUid=kbUid,
                question=question,
                answer=answer,
                model=chunk.model,
                created=chunk.created,
                promptTokens=usage.prompt_tokens if usage else 0,
                completionTokens=usage.completion_tokens if usage else 0,
                totalTokens=usage.total_tokens if usage else 0,
            )
# Module-level shared instance, created once when this module is imported.
myredisVector = MyRedisVector()
#

View File

@@ -0,0 +1,107 @@
import logging
import re
from typing import Any, List, Optional
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from chatchat.utils import build_logger
# logger = build_logger()
def _split_text_with_regex_from_end(
text: str, separator: str, keep_separator: bool
) -> List[str]:
# Now that we have the separator, split the text
if separator:
if keep_separator:
# The parentheses in the pattern keep the delimiters in the result.
_splits = re.split(f"({separator})", text)
splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])]
if len(_splits) % 2 == 1:
splits += _splits[-1:]
# splits = [_splits[0]] + splits
else:
splits = re.split(separator, text)
else:
splits = list(text)
return [s for s in splits if s != ""]
class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive text splitter aware of Chinese and Western sentence punctuation.

    The separators are tried in order, from coarse (blank lines) to fine
    (commas). Each entry is a regular expression unless
    ``is_separator_regex`` is false. Separators stay attached to the end of
    the preceding piece when ``keep_separator`` is true.
    """

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        is_separator_regex: bool = True,
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter.

        :param separators: ordered separator patterns; defaults to paragraph
            breaks, line breaks, then sentence, semicolon and comma
            punctuation in both full-width and ASCII form.
        :param keep_separator: keep matched separators in the output.
        :param is_separator_regex: treat separators as regular expressions.
        :param kwargs: forwarded to ``RecursiveCharacterTextSplitter``.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        # No alternation may contain an empty branch: an empty branch matches
        # the empty string, so it "matches" every text and splits it into
        # single characters. Raw strings avoid invalid escape sequences.
        self._separators = separators or [
            "\n\n",
            "\n",
            "。|!|?",
            r"\.\s|\!\s|\?\s",
            r";|;\s",
            r",|,\s",
        ]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return chunks."""
        final_chunks = []
        # Pick the first separator that occurs in the text. The last one is
        # the fallback when none matches.
        separator = separators[-1]
        new_separators = []
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
            if _s == "":
                separator = _s
                break
            if re.search(_separator, text):
                separator = _s
                # Only finer separators remain for recursive splitting.
                new_separators = separators[i + 1:]
                break

        _separator = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex_from_end(text, _separator, self._keep_separator)

        # Merge small pieces; recursively split pieces that are still too long.
        _good_splits = []
        _separator = "" if self._keep_separator else separator
        for s in splits:
            if self._length_function(s) < self._chunk_size:
                _good_splits.append(s)
            else:
                if _good_splits:
                    merged_text = self._merge_splits(_good_splits, _separator)
                    final_chunks.extend(merged_text)
                    _good_splits = []
                if not new_separators:
                    # Nothing finer to split on: keep the long piece as is.
                    final_chunks.append(s)
                else:
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits:
            merged_text = self._merge_splits(_good_splits, _separator)
            final_chunks.extend(merged_text)
        # Collapse runs of newlines and drop whitespace-only chunks.
        return [
            re.sub(r"\n{2,}", "\n", chunk.strip())
            for chunk in final_chunks
            if chunk.strip() != ""
        ]
if __name__ == "__main__":
    # Manual demo: split a sample Chinese report and print every chunk.
    splitter = ChineseRecursiveTextSplitter(
        keep_separator=True,
        is_separator_regex=True,
        chunk_size=50,
        chunk_overlap=0,
    )
    samples = [
        """中国对外贸易形势报告75页。前 10 个月,一般贸易进出口 19.5 万亿元,增长 25.1% 比整体进出口增速高出 2.9 个百分点,占进出口总额的 61.7%,较去年同期提升 1.6 个百分点。其中,一般贸易出口 10.6 万亿元,增长 25.3%,占出口总额的 60.9%,提升 1.5 个百分点进口8.9万亿元增长24.9%占进口总额的62.7% 提升 1.8 个百分点。加工贸易进出口 6.8 万亿元,增长 11.8% 占进出口总额的 21.5%,减少 2.0 个百分点。其中,出口增 长 10.4%,占出口总额的 24.3%,减少 2.6 个百分点;进口增 长 14.2%,占进口总额的 18.0%,减少 1.2 个百分点。此外, 以保税物流方式进出口 3.96 万亿元,增长 27.9%。其中,出 口 1.47 万亿元,增长 38.9%;进口 2.49 万亿元,增长 22.2%。前三季度,中国服务贸易继续保持快速增长态势。服务 进出口总额 37834.3 亿元,增长 11.6%;其中服务出口 17820.9 亿元,增长 27.3%;进口 20013.4 亿元,增长 0.5%,进口增 速实现了疫情以来的首次转正。服务出口增幅大于进口 26.8 个百分点,带动服务贸易逆差下降 62.9%至 2192.5 亿元。服 务贸易结构持续优化,知识密集型服务进出口 16917.7 亿元, 增长 13.3%,占服务进出口总额的比重达到 44.7%,提升 0.7 个百分点。 二、中国对外贸易发展环境分析和展望 全球疫情起伏反复,经济复苏分化加剧,大宗商品价格 上涨、能源紧缺、运力紧张及发达经济体政策调整外溢等风 险交织叠加。同时也要看到,我国经济长期向好的趋势没有 改变,外贸企业韧性和活力不断增强,新业态新模式加快发 展,创新转型步伐提速。产业链供应链面临挑战。美欧等加快出台制造业回迁计 划,加速产业链供应链本土布局,跨国公司调整产业链供应 链,全球双链面临新一轮重构,区域化、近岸化、本土化、 短链化趋势凸显。疫苗供应不足,制造业“缺芯”、物流受限、 运价高企,全球产业链供应链面临压力。 全球通胀持续高位运行。能源价格上涨加大主要经济体 的通胀压力,增加全球经济复苏的不确定性。世界银行今年 10 月发布《大宗商品市场展望》指出,能源价格在 2021 年 大涨逾 80%,并且仍将在 2022 年小幅上涨。IMF 指出,全 球通胀上行风险加剧,通胀前景存在巨大不确定性。""",
    ]
    for index, sample in enumerate(samples):
        print(index)
        for piece in splitter.split_text(sample):
            print(piece)

26
modules/python/app/tts.py Normal file
View File

@@ -0,0 +1,26 @@
#
#
import logging
from fastapi import APIRouter, Request
# from sse_starlette import EventSourceResponse
import edge_tts
# https://github.com/rany2/edge-tts/blob/master/README.md
# https://tts.byylook.com/ai/text-to-speech?source=github
# Router for the text-to-speech endpoints; routes are served under /tts.
router = APIRouter(prefix='/tts', tags=['tts v1 apis'])
#
# https://github.com/rany2/edge-tts/blob/master/examples/basic_generation.py
# # http://127.0.0.1:9007/api/v1/tts/test
# 列出音色edge-tts --list-voices
@router.get("/test")
async def tts():
    """Synthesize a fixed sample sentence with edge-tts and save it as an MP3.

    The output is written to ``test.mp3`` (a relative path). Always returns
    the string ``"ok"``.
    """
    text, voice, output_file = "Hello World!", "en-GB-SoniaNeural", "test.mp3"
    await edge_tts.Communicate(text, voice).save(output_file)
    return "ok"

View File

@@ -0,0 +1,49 @@
'''
Author: jackning 270580156@qq.com
Date: 2024-08-29 09:55:35
LastEditors: jackning 270580156@qq.com
LastEditTime: 2024-08-29 15:10:19
Description: bytedesk.com https://github.com/Bytedesk/bytedesk
Please be aware of the BSL license restrictions before installing Bytedesk IM
selling, reselling, or hosting Bytedesk IM as a service is a breach of the terms and automatically terminates your rights under the license.
仅支持企业内部员工自用严禁私自用于销售、二次销售或者部署SaaS方式销售
Business Source License 1.1: https://github.com/Bytedesk/bytedesk/blob/main/LICENSE
contact: 270580156@qq.com
技术/商务联系270580156@qq.com
Copyright (c) 2024 by bytedesk.com, All Rights Reserved.
'''
#
import os
import requests
def download_file(file_url: str, destination_folder: str = 'files', timeout: float = 30) -> str | None:
    """Download a file into ``destination_folder``.

    :param file_url: URL of the file to download. The local file name is the
        text after the last ``/`` of the URL.
    :param destination_folder: folder to save into (created if missing),
        ``'files'`` by default.
    :param timeout: seconds to wait for the server before giving up, so a
        stalled server cannot block the caller forever.
    :return: path of the saved file, or ``None`` when the server did not
        answer with HTTP 200 (previously the path was returned even though
        nothing had been written).
    """
    import logging

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(destination_folder, exist_ok=True)
    file_name = file_url.split('/')[-1]
    file_path = os.path.join(destination_folder, file_name)
    # Stream the body so large files are never held in memory at once.
    with requests.get(file_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            logging.error(f"下载失败,状态码: {response.status_code}")
            return None
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    logging.info(f"文件已成功下载到: {file_path}")
    return file_path