mirror of
https://gitcode.com/gh_mirrors/eas/EasyFace.git
synced 2025-12-30 13:02:29 +00:00
83 lines
2.5 KiB
Python
83 lines
2.5 KiB
Python
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
|
|
import re
|
|
import string
|
|
|
|
from zhconv import convert
|
|
|
|
# Characters treated as Chinese punctuation. Everything is spelled with
# escapes so that width-folding tools cannot silently turn the fullwidth
# forms into ASCII (which would also leave a bare quote inside the literal).
# NOTE(review): the fullwidth/halfwidth code points are reconstructed on the
# assumption that the ASCII-looking punctuation in the previous literal was
# width-folded; every non-ASCII character of that literal is kept as well.
# Confirm against the upstream file.
CHINESE_PUNCTUATION = (
    # Fullwidth counterparts of "#$%&'()*+,-/:;<=>@[\]^_`{|}~
    '\uff02\uff03\uff04\uff05\uff06\uff07\uff08\uff09\uff0a\uff0b\uff0c\uff0d'
    '\uff0f\uff1a\uff1b\uff1c\uff1d\uff1e\uff20\uff3b\uff3c\uff3d\uff3e\uff3f'
    '\uff40\uff5b\uff5c\uff5d\uff5e'
    # White parentheses: fullwidth forms plus the U+2985/U+2986 forms
    '\uff5f\uff60\u2985\u2986'
    # Halfwidth corner brackets and ideographic comma
    '\uff62\uff63\uff64'
    # Ideographic space, ideographic comma, ditto mark
    '\u3000\u3001\u3003'
    # CJK angle, corner and lenticular brackets
    '\u3008\u3009\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011'
    # Tortoise-shell / white brackets, wave dash, double-prime quotes
    '\u3014\u3015\u3016\u3017\u3018\u3019\u301a\u301b\u301c\u301d\u301e\u301f'
    # Wavy dash, ideographic variation indicator, half fill space
    '\u3030\u303e\u303f'
    # En dash, em dash
    '\u2013\u2014'
    # Curly single and double quotation marks
    '\u2018\u2019\u201b\u201c\u201d\u201e\u201f'
    # Horizontal ellipsis, hyphenation point
    '\u2026\u2027'
    # Wavy low line, small ideographic comma, small semicolon
    '\ufe4f\ufe51\ufe54'
    # Middle dot
    '\u00b7'
    # Sentence stops: fullwidth ! and ?, halfwidth and regular ideographic stop
    '\uff01\uff1f\uff61\u3002'
)
# ASCII punctuation, kept separate from the Chinese set above.
ENGLISH_PUNCTUATION = string.punctuation
|
|
|
|
|
|
def remove_space_between_chinese_chars(decoded_str: str):
    """Glue together neighbouring tokens that ``_is_chinese_str`` accepts.

    The input is split on single spaces. Every maximal run of consecutive
    tokens for which ``_is_chinese_str`` returns true is concatenated without
    a separator; all other tokens are kept unchanged. The resulting tokens
    are joined with single spaces and surrounding whitespace is stripped.
    """
    merged = []
    run = []  # tokens of the accepted run that is currently being collected
    for token in decoded_str.split(' '):
        if _is_chinese_str(token):
            run.append(token)
            continue
        # A rejected token closes any open run before being emitted itself.
        if run:
            merged.append(''.join(run))
            run = []
        merged.append(token)
    # Flush a run that extends to the end of the input.
    if run:
        merged.append(''.join(run))
    return ' '.join(merged).strip()
|
|
|
|
|
|
# add space for each chinese char
|
|
def rebuild_chinese_str(string: str):
    """Return *string* with every Chinese character set off by spaces.

    Characters accepted by ``_is_chinese_char`` or contained in
    ``CHINESE_PUNCTUATION`` are padded with a space on both sides; all other
    characters are copied unchanged. Whitespace runs in the padded text are
    then collapsed to single spaces and the ends are trimmed.
    """
    pieces = []
    for ch in string:
        if _is_chinese_char(ch) or ch in CHINESE_PUNCTUATION:
            pieces.append(f' {ch} ')
        else:
            pieces.append(ch)
    padded = ''.join(pieces)
    # split() without arguments drops leading/trailing and repeated whitespace.
    return ' '.join(padded.split())
|
|
|
|
|
|
def _is_chinese_str(string: str) -> bool:
    """Return True when *string* consists only of Chinese text or punctuation.

    A character qualifies when ``_is_chinese_char`` accepts it or when it is
    contained in ``CHINESE_PUNCTUATION`` or ``ENGLISH_PUNCTUATION``. An empty
    string yields True because ``all`` over no items is True.
    """
    # Do not add a bare truthiness fallback (``or cp``) to this condition: a
    # one-character string is always truthy, so the predicate would accept
    # every input and could never return False.
    return all(
        _is_chinese_char(cp) or cp in CHINESE_PUNCTUATION
        or cp in ENGLISH_PUNCTUATION for cp in string)
|
|
|
|
|
|
def _is_chinese_char(cp: str) -> bool:
|
|
"""Checks whether CP is the codepoint of a CJK character."""
|
|
cp = ord(cp)
|
|
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
|
|
or (cp >= 0x20000 and cp <= 0x2A6DF)
|
|
or (cp >= 0x2A700 and cp <= 0x2B73F)
|
|
or (cp >= 0x2B740 and cp <= 0x2B81F)
|
|
or (cp >= 0x2B820 and cp <= 0x2CEAF)
|
|
or (cp >= 0xF900 and cp <= 0xFAFF)
|
|
or (cp >= 0x2F800 and cp <= 0x2FA1F)):
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def normalize_chinese_number(text):
    """Replace ASCII digits in *text* with Chinese numerals.

    Each character ``0``-``9`` is replaced, one digit at a time, by the
    numeral at the same index in ``chinese_number`` (``0`` -> 零 ... ``9`` ->
    九); no positional words are produced. All other characters are copied
    unchanged. The result is then passed through ``convert`` with the
    ``'zh-hans'`` locale and returned.
    """
    chinese_number = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
    # Key the numerals by their own digit so that each digit maps to its
    # matching numeral rather than all of them collapsing to the first entry.
    digit_to_numeral = {
        str(value): numeral
        for value, numeral in enumerate(chinese_number)
    }
    new_text = ''.join(digit_to_numeral.get(x, x) for x in text)
    return convert(new_text, 'zh-hans')
|
|
|
|
|
|
def pre_chinese(text, max_words):
    """Lower-case *text*, blank out punctuation and truncate the result.

    Processing order: lower-case; replace every character found in
    ``CHINESE_PUNCTUATION`` or ``ENGLISH_PUNCTUATION`` with a space; collapse
    runs of two or more whitespace characters into one space; drop trailing
    newlines; strip surrounding spaces; keep the first *max_words* characters.

    Args:
        text: The string to normalise.
        max_words: Upper slice bound applied to the cleaned string. The slice
            works on the string itself, so it limits characters rather than
            space-separated words.

    Returns:
        The normalised, truncated string.
    """
    # str.replace() matches its first argument only as one whole substring, so
    # passing a full punctuation set to it removes nothing; a translation
    # table maps each punctuation character to a space individually.
    punctuation_to_space = str.maketrans(
        dict.fromkeys(CHINESE_PUNCTUATION + ENGLISH_PUNCTUATION, ' '))
    text = text.lower().translate(punctuation_to_space)
    text = re.sub(r'\s{2,}', ' ', text)
    text = text.rstrip('\n')
    return text.strip(' ')[:max_words]
|