cengal.text_processing.text_processing.versions.v_0.processing
Text-processing helpers for normalizing, searching and replacing text uniformly across str, bytes and bytearray values.
1#!/usr/bin/env python 2# coding=utf-8 3 4# Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space> 5# 6# Licensed under the Apache License, Version 2.0 (the "License"); 7# you may not use this file except in compliance with the License. 8# You may obtain a copy of the License at 9# 10# http://www.apache.org/licenses/LICENSE-2.0 11# 12# Unless required by applicable law or agreed to in writing, software 13# distributed under the License is distributed on an "AS IS" BASIS, 14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15# See the License for the specific language governing permissions and 16# limitations under the License. 17 18 19""" 20Module Docstring 21Docstrings: http://www.python.org/dev/peps/pep-0257/ 22""" 23 24 25__author__ = "ButenkoMS <gtalk@butenkoms.space>" 26__copyright__ = "Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>" 27__credits__ = ["ButenkoMS <gtalk@butenkoms.space>", ] 28__license__ = "Apache License, Version 2.0" 29__version__ = "4.4.1" 30__maintainer__ = "ButenkoMS <gtalk@butenkoms.space>" 31__email__ = "gtalk@butenkoms.space" 32# __status__ = "Prototype" 33__status__ = "Development" 34# __status__ = "Production" 35 36 37__all__ = ['Text', 'BinText', 'DEFAULT_ENCODING', 'EncodingRequired', 'NotSupportedDataType', 'NotSupportedDesiredTextType', 38 'normalize_text', 'normalize_text_to_data', 'find_text', 'replace_slice', 'replace_text', 'normalize_line_separators', 39 'normalize_line_separators_and_tabs', 'removeprefix', 'removesuffix', 'to_identifier', 'remove_repetitive'] 40 41 42#!/usr/bin/env python 43# coding=utf-8 44 45 46from cengal.system import PYTHON_VERSION_INT 47from typing import Optional, Tuple, Union, Type, Callable, Set, List 48import string 49import keyword 50 51 52Text = Union[bytes, bytearray, str] 53BinText = Union[bytes, bytearray] 54DEFAULT_ENCODING = 'utf-8' 55 56 57class EncodingRequired(Exception): 58 pass 59 60 
class NotSupportedDesiredTextType(Exception):
    """Signals that a ``desired_type`` cannot be produced.

    NOTE(review): never raised inside this module — presumably reserved for
    custom ``normalizer`` callables; confirm against external callers.
    """
    pass


class NotSupportedDataType(Exception):
    """Signals that an input data type cannot be handled.

    NOTE(review): never raised inside this module — presumably reserved for
    custom ``normalizer`` callables; confirm against external callers.
    """
    pass


def _default_normalizer(text: Text, desired_type: Type, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Fallback normalizer: any conversion that the built-in
    str/bytes/bytearray handling in ``normalize_text`` cannot perform is
    unsupported."""
    raise NotImplementedError


def normalize_text(text: Text, desired_type: Type, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Convert ``text`` to ``desired_type`` (``bytes``, ``bytearray`` or ``str``).

    :param text: source value; anything other than str/bytes/bytearray is
        passed to ``normalizer``.
    :param desired_type: target type.
    :param encoding: codec used for str <-> bytes conversions.
    :param normalizer: optional callable ``(text, desired_type, encoding)``
        for combinations not handled natively.
    :raises EncodingRequired: a str <-> bytes conversion is needed but
        ``encoding`` is falsy.
    :raises NotImplementedError: an unsupported combination reached the
        default normalizer.
    """
    normalizer = normalizer or _default_normalizer

    if issubclass(desired_type, bytes):
        if isinstance(text, bytes):
            return text
        if isinstance(text, bytearray):
            return bytes(text)
        if isinstance(text, str):
            if not encoding:
                raise EncodingRequired
            return text.encode(encoding)
    elif issubclass(desired_type, bytearray):
        if isinstance(text, bytearray):
            return text
        if isinstance(text, bytes):
            return bytearray(text)
        if isinstance(text, str):
            if not encoding:
                raise EncodingRequired
            return bytearray(text, encoding)
    elif issubclass(desired_type, str):
        if isinstance(text, str):
            return text
        if isinstance(text, (bytes, bytearray)):
            if not encoding:
                raise EncodingRequired
            return text.decode(encoding)

    # Source or destination type not handled above: delegate to the pluggable
    # normalizer (the default one raises NotImplementedError).
    return normalizer(text, desired_type, encoding)


def normalize_text_to_data(data: Text, text: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Convert ``text`` to the exact type of ``data`` (see ``normalize_text``)."""
    data_type = type(data)
    if not isinstance(text, data_type):
        text = normalize_text(text, data_type, encoding, normalizer)

    return text


def find_text(data: Text, text: Text, start: int = 0, stop: Optional[int] = None, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Optional[slice]:
    """Find the first occurrence of ``text`` in ``data``.

    Returns a ``slice`` covering the match, or ``None`` when not found.

    BUGFIX: the default ``stop`` used to be ``-1``, which ``find`` interprets
    with slice semantics ("up to, but excluding, the last character"), so a
    match ending at the last character of ``data`` was never found.  The
    default is now ``None``, meaning "search to the end of data"; explicit
    integer arguments keep their usual slice meaning.
    """
    text = normalize_text_to_data(data, text, encoding, normalizer)
    index = data.find(text, start, stop)
    if -1 == index:
        return None

    return slice(index, index + len(text))


def replace_slice(data: Text, place: slice, text: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Tuple[Text, slice]:
    """Replace the ``place`` span of ``data`` with ``text``.

    Returns the new text and the slice now occupied by the inserted ``text``
    (``place.step`` is ignored).
    """
    text = normalize_text_to_data(data, text, encoding, normalizer)
    l_text = data[:place.start]
    r_text = data[place.stop:]
    result_text = l_text + text + r_text
    result_place = slice(place.start, place.start + len(text))
    return result_text, result_place


def replace_text(data: Text, old_text: Text, new_text: Text, count: int = -1, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Replace up to ``count`` occurrences of ``old_text`` with ``new_text``
    (``count == -1`` replaces all occurrences).

    Return annotation tightened from ``Optional[Text]`` to ``Text``:
    ``str.replace``/``bytes.replace`` never return ``None``.
    """
    old_text = normalize_text_to_data(data, old_text, encoding, normalizer)
    new_text = normalize_text_to_data(data, new_text, encoding, normalizer)
    return data.replace(old_text, new_text, count)


def normalize_line_separators(text: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Rewrite every line separator in ``text`` as ``'\\n'`` (converted to
    the type of ``text``).

    Note: a trailing line separator is dropped (``splitlines`` behavior),
    and for ``str`` input every Unicode line boundary is normalized.
    """
    lines = text.splitlines()
    line_separator = '\n'
    return normalize_text(line_separator, type(text), encoding, normalizer).join(lines)


def normalize_line_separators_and_tabs(text: Text, tabsize=4, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Normalize line separators to ``'\\n'`` and expand tabs to runs of up
    to ``tabsize`` spaces."""
    text = normalize_line_separators(text, encoding, normalizer)
    return text.expandtabs(tabsize)


def removeprefix(data: Text, prefix: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Return ``data`` with ``prefix`` removed if present (works on
    Python < 3.9 too)."""
    prefix = normalize_text_to_data(data, prefix, encoding, normalizer)
    if (3, 9) <= PYTHON_VERSION_INT:
        return data.removeprefix(prefix)

    if data.startswith(prefix):
        return data[len(prefix):]

    return data


def removesuffix(data: Text, suffix: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Return ``data`` with ``suffix`` removed if present (works on
    Python < 3.9 too).

    BUGFIX: the pre-3.9 fallback evaluated ``data[:-len(suffix):]`` even for
    an empty suffix; since ``data.endswith('')`` is always True and
    ``data[:-0]`` is empty, an empty ``suffix`` wiped the whole value.  An
    empty suffix now leaves ``data`` unchanged, matching ``str.removesuffix``.
    """
    suffix = normalize_text_to_data(data, suffix, encoding, normalizer)
    if (3, 9) <= PYTHON_VERSION_INT:
        return data.removesuffix(suffix)

    if suffix and data.endswith(suffix):
        return data[:-len(suffix)]

    return data


def to_identifier(text: Text, need_to_remove_repetitive: bool = True, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Turn ``text`` into a valid Python identifier, returned as the same
    type as the input.

    Every character outside ``[A-Za-z0-9_]`` becomes ``'_'``; repeated
    ``'_'`` runs are optionally collapsed; a leading ``'_'`` is prepended
    when the result is empty or starts with a digit; a trailing ``'_'`` is
    appended while the result is a Python keyword.

    BUGFIX: the function used to return the (normalized) *input* text,
    silently discarding the computed identifier; it now returns the
    identifier converted back to the input's type.
    """
    original_text = text
    text = normalize_text_to_data(str(), text, encoding, normalizer)
    valid_initial_chars = string.ascii_letters + '_'
    valid_chars = valid_initial_chars + string.digits
    text_chars: Set[str] = set(text)

    # Single C-level pass replacing every invalid character with '_'.
    trans = str.maketrans({
        char: '_' for char in text_chars if char not in valid_chars
    })
    identifier = text.translate(trans)
    if need_to_remove_repetitive:
        identifier = remove_repetitive(identifier, '_')

    if not identifier or identifier[0] not in valid_initial_chars:
        identifier = '_' + identifier

    while keyword.iskeyword(identifier):
        identifier += '_'

    return normalize_text_to_data(original_text, identifier, encoding, normalizer)


def remove_repetitive(data: Text, sub_str: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Collapse consecutive repetitions of ``sub_str`` in ``data`` into a
    single occurrence; leading and trailing occurrences are removed entirely.

    (A previous ``normalized_data.strip(...)`` statement discarded its
    result — strings are immutable — and was redundant anyway: filtering out
    the empty split pieces below already drops leading/trailing separators.
    The dead statement was removed; behavior is unchanged.)
    """
    normalized_data: str = normalize_text_to_data(str(), data, encoding, normalizer)
    normalized_sub_str: str = normalize_text_to_data(str(), sub_str, encoding, normalizer)
    split_normalized_data: List[str] = normalized_data.split(normalized_sub_str)
    result: str = normalized_sub_str.join(piece for piece in split_normalized_data if piece)
    return normalize_text_to_data(data, result, encoding, normalizer)
Text =
typing.Union[bytes, bytearray, str]
BinText =
typing.Union[bytes, bytearray]
DEFAULT_ENCODING =
'utf-8'
class
EncodingRequired(builtins.Exception):
Common base class for all non-exit exceptions.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
class
NotSupportedDataType(builtins.Exception):
Common base class for all non-exit exceptions.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
class
NotSupportedDesiredTextType(builtins.Exception):
Common base class for all non-exit exceptions.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
def
normalize_text( text: Union[bytes, bytearray, str], desired_type: Type, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
74def normalize_text(text: Text, desired_type: Type, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 75 normalizer = normalizer or _default_normalizer 76 77 need_to_use_normalizer = False 78 if issubclass(desired_type, bytes): 79 if isinstance(text, bytes): 80 pass 81 elif isinstance(text, bytearray): 82 text = bytes(text) 83 elif isinstance(text, str): 84 if encoding: 85 text = text.encode(encoding) 86 else: 87 raise EncodingRequired 88 else: 89 need_to_use_normalizer = True 90 elif issubclass(desired_type, bytearray): 91 if isinstance(text, bytearray): 92 pass 93 elif isinstance(text, bytes): 94 text = bytearray(text) 95 elif isinstance(text, str): 96 if encoding: 97 text = bytearray(text, encoding) 98 else: 99 raise EncodingRequired 100 else: 101 need_to_use_normalizer = True 102 elif issubclass(desired_type, str): 103 if isinstance(text, str): 104 pass 105 elif isinstance(text, bytes) or isinstance(text, bytearray): 106 if encoding: 107 text = text.decode(encoding) 108 else: 109 raise EncodingRequired 110 else: 111 need_to_use_normalizer = True 112 else: 113 need_to_use_normalizer = True 114 115 if need_to_use_normalizer: 116 text = normalizer(text, desired_type, encoding) 117 118 return text
def
normalize_text_to_data( data: Union[bytes, bytearray, str], text: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
def
find_text( data: Union[bytes, bytearray, str], text: Union[bytes, bytearray, str], start: int = 0, stop: int = -1, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[slice, NoneType]:
129def find_text(data: Text, text: Text, start: int = 0, stop: int = -1, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Optional[slice]: 130 text = normalize_text_to_data(data, text, encoding, normalizer) 131 start = data.find(text, start, stop) 132 if -1 == start: 133 return None 134 135 return slice(start, start + len(text))
def
replace_slice( data: Union[bytes, bytearray, str], place: slice, text: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Tuple[Union[bytes, bytearray, str], slice]:
138def replace_slice(data: Text, place: slice, text: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Tuple[Text, slice]: 139 text = normalize_text_to_data(data, text, encoding, normalizer) 140 l_text = data[:place.start] 141 r_text = data[place.stop:] 142 result_text = l_text + text + r_text 143 result_place = slice(place.start, place.start + len(text)) 144 return result_text, result_place
def
replace_text( data: Union[bytes, bytearray, str], old_text: Union[bytes, bytearray, str], new_text: Union[bytes, bytearray, str], count: int = -1, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str, NoneType]:
147def replace_text(data: Text, old_text: Text, new_text: Text, count: int = -1, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Optional[Text]: 148 old_text = normalize_text_to_data(data, old_text, encoding, normalizer) 149 new_text = normalize_text_to_data(data, new_text, encoding, normalizer) 150 return data.replace(old_text, new_text, count)
def
normalize_line_separators( text: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
def
normalize_line_separators_and_tabs( text: Union[bytes, bytearray, str], tabsize=4, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
def
removeprefix( data: Union[bytes, bytearray, str], prefix: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
164def removeprefix(data: Text, prefix: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 165 prefix = normalize_text_to_data(data, prefix, encoding, normalizer) 166 if (3, 9) <= PYTHON_VERSION_INT: 167 return data.removeprefix(prefix) 168 else: 169 if data.startswith(prefix): 170 return data[len(prefix):] 171 else: 172 return data
def
removesuffix( data: Union[bytes, bytearray, str], suffix: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
175def removesuffix(data: Text, suffix: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 176 suffix = normalize_text_to_data(data, suffix, encoding, normalizer) 177 if (3, 9) <= PYTHON_VERSION_INT: 178 return data.removesuffix(suffix) 179 else: 180 if data.endswith(suffix): 181 return data[:-len(suffix):] 182 else: 183 return data
def
to_identifier( text: Union[bytes, bytearray, str], need_to_remove_repetitive: bool = True, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
186def to_identifier(text: Text, need_to_remove_repetitive: bool = True, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 187 original_text = text 188 text = normalize_text_to_data(str(), text, encoding, normalizer) 189 valid_initial_chars = string.ascii_letters + '_' 190 valid_chars = valid_initial_chars + string.digits 191 text_chars: Set[str] = set(text) 192 193 trans = str.maketrans({ 194 char: '_' for char in text_chars if char not in valid_chars 195 }) 196 identifier = text.translate(trans) 197 if need_to_remove_repetitive: 198 identifier = remove_repetitive(identifier, '_') 199 200 if not identifier or identifier[0] not in valid_initial_chars: 201 identifier = '_' + identifier 202 203 while keyword.iskeyword(identifier): 204 identifier += '_' 205 206 return normalize_text_to_data(original_text, text, encoding, normalizer)
def
remove_repetitive( data: Union[bytes, bytearray, str], sub_str: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
209def remove_repetitive(data: Text, sub_str: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 210 normalized_data: str = normalize_text_to_data(str(), data, encoding, normalizer) 211 normalized_sub_str: str = normalize_text_to_data(str(), sub_str, encoding, normalizer) 212 normalized_data.strip(normalized_sub_str) 213 split_normalized_data: List[str] = normalized_data.split(normalized_sub_str) 214 result: str = normalized_sub_str.join((piece for piece in split_normalized_data if piece)) 215 return normalize_text_to_data(data, result, encoding, normalizer)