# Source: EasyFace/modelscope/utils/chinese_utils.py
# Snapshot: 2023-03-02 11:17:26 +08:00 (83 lines, 2.5 KiB, Python)

# Copyright (c) Alibaba, Inc. and its affiliates.
import re
import string
from zhconv import convert
# Punctuation treated as "Chinese": fullwidth/halfwidth forms plus CJK symbols.
#
# The fullwidth variants of ASCII punctuation sit exactly 0xFEE0 above their
# ASCII counterparts. They are derived from ASCII (and the halfwidth CJK forms
# are written as escapes) so that width-folding of this source text cannot
# collapse them into plain ASCII -- which would both overlap with
# ENGLISH_PUNCTUATION and break the quoting of a plain string literal.
CHINESE_PUNCTUATION = (
    # Fullwidth forms of: " # $ % & ' ( ) * + , - / : ; < = > @ [ \ ] ^ _ ` { | } ~
    ''.join(chr(ord(c) + 0xFEE0) for c in '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~')
    # Fullwidth white parentheses, halfwidth corner brackets and comma.
    + '\uff5f\uff60\uff62\uff63\uff64'
    # Ideographic space followed by CJK symbols and general punctuation.
    + '\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'
    # Sentence stops: fullwidth ! and ?, halfwidth and ideographic full stop.
    + '\uff01\uff1f\uff61。'
)
# ASCII punctuation, as defined by the standard library.
ENGLISH_PUNCTUATION = string.punctuation
def remove_space_between_chinese_chars(decoded_str: str):
    """Drop the spaces that separate consecutive Chinese words.

    The input is split on single spaces. Every maximal run of neighbouring
    words accepted by ``_is_chinese_str`` is glued together without
    separators; all other words are kept as they are.

    Args:
        decoded_str: Space-separated text.

    Returns:
        The pieces re-joined with single spaces, with leading and trailing
        whitespace stripped.
    """
    merged = []
    pending_run = []  # consecutive Chinese words not yet flushed to `merged`
    for token in decoded_str.split(' '):
        if _is_chinese_str(token):
            pending_run.append(token)
            continue
        # A non-Chinese word ends the current run (if any).
        if pending_run:
            merged.append(''.join(pending_run))
            pending_run = []
        merged.append(token)
    # Flush a run that reaches the end of the input.
    if pending_run:
        merged.append(''.join(pending_run))
    return ' '.join(merged).strip()
# add space for each chinese char
def rebuild_chinese_str(string: str):
    """Separate every Chinese character or punctuation mark with spaces.

    Each character that ``_is_chinese_char`` accepts, or that occurs in
    ``CHINESE_PUNCTUATION``, is padded with a space on both sides; other
    characters are left untouched. Whitespace is then normalised so that the
    resulting tokens are separated by exactly one space.

    Args:
        string: Text to re-space.

    Returns:
        The re-spaced text without leading or trailing whitespace.
    """
    pieces = []
    for ch in string:
        if _is_chinese_char(ch) or ch in CHINESE_PUNCTUATION:
            pieces.append(f' {ch} ')
        else:
            pieces.append(ch)
    padded = ''.join(pieces)
    # split() with no argument collapses any run of whitespace.
    return ' '.join(padded.split())
def _is_chinese_str(string: str) -> bool:
    """Check whether a string consists only of Chinese text and punctuation.

    Args:
        string: Text to test.

    Returns:
        True when every character is a CJK character (per
        ``_is_chinese_char``), is in ``CHINESE_PUNCTUATION`` or is in
        ``ENGLISH_PUNCTUATION``. An empty string yields True because
        ``all()`` of nothing is True.
    """
    # Every character must match one of the three classes. There is
    # deliberately no catch-all term: a bare one-character string is always
    # truthy and would make the predicate accept any input.
    return all(
        _is_chinese_char(cp) or cp in CHINESE_PUNCTUATION
        or cp in ENGLISH_PUNCTUATION for cp in string)
def _is_chinese_char(cp: str) -> bool:
    """Checks whether CP is the codepoint of a CJK character.

    Args:
        cp: A single character; ``ord`` raises ``TypeError`` for strings of
            any other length.

    Returns:
        True when the character's code point lies in one of the CJK
        ideograph ranges listed below, False otherwise.
    """
    code_point = ord(cp)
    # Inclusive (first, last) code point ranges.
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # Extension B
        (0x2A700, 0x2B73F),  # Extension C
        (0x2B740, 0x2B81F),  # Extension D
        (0x2B820, 0x2CEAF),  # Extension E
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )
    return any(first <= code_point <= last for first, last in cjk_ranges)
def normalize_chinese_number(text):
    """Spell ASCII digits as Chinese numerals and convert to Simplified Chinese.

    Args:
        text: Text to normalise; it is processed character by character.

    Returns:
        The text with each of ``0``-``9`` replaced by the matching Chinese
        numeral, passed through ``convert(..., 'zh-hans')``.
    """
    # Indexed by digit value: chinese_number[3] is the numeral for 3.
    chinese_number = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
    digit_to_numeral = {
        str(value): numeral
        for value, numeral in enumerate(chinese_number)
    }
    # Each digit maps to its own numeral; non-digits pass through unchanged.
    new_text = ''.join(digit_to_numeral.get(x, x) for x in text)
    return convert(new_text, 'zh-hans')
def pre_chinese(text, max_words):
    """Lower-case text, blank out punctuation and truncate it.

    Args:
        text: Text to clean.
        max_words: Maximum length of the result; the cleaned text is cut
            with a plain slice, i.e. counted in characters.

    Returns:
        The lower-cased text with every character from
        ``CHINESE_PUNCTUATION`` and ``ENGLISH_PUNCTUATION`` replaced by a
        space, runs of whitespace collapsed to one space, surrounding
        newlines/spaces removed, and at most ``max_words`` characters kept.
    """
    # str.replace() only matches its whole first argument as one substring,
    # so the punctuation sets must be applied character by character.
    punctuation_to_space = str.maketrans(
        dict.fromkeys(CHINESE_PUNCTUATION + ENGLISH_PUNCTUATION, ' '))
    text = text.lower().translate(punctuation_to_space)
    # Collapse the whitespace runs left behind by removed punctuation.
    text = re.sub(r'\s{2,}', ' ', text)
    text = text.rstrip('\n')
    return text.strip(' ')[:max_words]