cengal.text_processing.text_processing.versions.v_0.processing
Text-processing helpers for normalizing, searching and replacing text uniformly across str, bytes and bytearray values.
1#!/usr/bin/env python 2# coding=utf-8 3 4# Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space> 5# 6# Licensed under the Apache License, Version 2.0 (the "License"); 7# you may not use this file except in compliance with the License. 8# You may obtain a copy of the License at 9# 10# http://www.apache.org/licenses/LICENSE-2.0 11# 12# Unless required by applicable law or agreed to in writing, software 13# distributed under the License is distributed on an "AS IS" BASIS, 14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15# See the License for the specific language governing permissions and 16# limitations under the License. 17 18 19""" 20Module Docstring 21Docstrings: http://www.python.org/dev/peps/pep-0257/ 22""" 23 24 25__author__ = "ButenkoMS <gtalk@butenkoms.space>" 26__copyright__ = "Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>" 27__credits__ = ["ButenkoMS <gtalk@butenkoms.space>", ] 28__license__ = "Apache License, Version 2.0" 29__version__ = "4.4.1" 30__maintainer__ = "ButenkoMS <gtalk@butenkoms.space>" 31__email__ = "gtalk@butenkoms.space" 32# __status__ = "Prototype" 33__status__ = "Development" 34# __status__ = "Production" 35 36 37__all__ = ['Text', 'BinText', 'DEFAULT_ENCODING', 'EncodingRequired', 'NotSupportedDataType', 'NotSupportedDesiredTextType', 38 'normalize_text', 'normalize_text_to_data', 'find_text', 'replace_slice', 'replace_text', 'normalize_line_separators', 39 'normalize_line_separators_and_tabs', 'removeprefix', 'removesuffix', 'to_identifier', 'remove_repetitive'] 40 41 42#!/usr/bin/env python 43# coding=utf-8 44 45 46from cengal.system import PYTHON_VERSION_INT 47from typing import Optional, Tuple, Union, Type, Callable, Set, List 48import string 49import keyword 50 51 52Text = Union[bytes, bytearray, str] 53BinText = Union[bytes, bytearray] 54DEFAULT_ENCODING = 'utf-8' 55 56 57class EncodingRequired(Exception): 58 pass 59 60 
class NotSupportedDesiredTextType(Exception):
    """Signals that a ``desired_type`` cannot be produced.

    NOTE(review): never raised inside this module — presumably reserved for
    custom ``normalizer`` callables; confirm against external callers.
    """
    pass


class NotSupportedDataType(Exception):
    """Signals that an input data type cannot be handled.

    NOTE(review): never raised inside this module — presumably reserved for
    custom ``normalizer`` callables; confirm against external callers.
    """
    pass


def _default_normalizer(text: Text, desired_type: Type, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Fallback normalizer: any conversion that the built-in
    str/bytes/bytearray handling in ``normalize_text`` cannot perform is
    unsupported."""
    raise NotImplementedError


def normalize_text(text: Text, desired_type: Type, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Convert ``text`` to ``desired_type`` (``bytes``, ``bytearray`` or ``str``).

    :param text: source value; anything other than str/bytes/bytearray is
        passed to ``normalizer``.
    :param desired_type: target type.
    :param encoding: codec used for str <-> bytes conversions.
    :param normalizer: optional callable ``(text, desired_type, encoding)``
        for combinations not handled natively.
    :raises EncodingRequired: a str <-> bytes conversion is needed but
        ``encoding`` is falsy.
    :raises NotImplementedError: an unsupported combination reached the
        default normalizer.
    """
    normalizer = normalizer or _default_normalizer

    if issubclass(desired_type, bytes):
        if isinstance(text, bytes):
            return text
        if isinstance(text, bytearray):
            return bytes(text)
        if isinstance(text, str):
            if not encoding:
                raise EncodingRequired
            return text.encode(encoding)
    elif issubclass(desired_type, bytearray):
        if isinstance(text, bytearray):
            return text
        if isinstance(text, bytes):
            return bytearray(text)
        if isinstance(text, str):
            if not encoding:
                raise EncodingRequired
            return bytearray(text, encoding)
    elif issubclass(desired_type, str):
        if isinstance(text, str):
            return text
        if isinstance(text, (bytes, bytearray)):
            if not encoding:
                raise EncodingRequired
            return text.decode(encoding)

    # Source or destination type not handled above: delegate to the pluggable
    # normalizer (the default one raises NotImplementedError).
    return normalizer(text, desired_type, encoding)


def normalize_text_to_data(data: Text, text: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Convert ``text`` to the exact type of ``data`` (see ``normalize_text``)."""
    data_type = type(data)
    if not isinstance(text, data_type):
        text = normalize_text(text, data_type, encoding, normalizer)

    return text


def find_text(data: Text, text: Text, start: int = 0, stop: Optional[int] = None, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Optional[slice]:
    """Find the first occurrence of ``text`` in ``data``.

    Returns a ``slice`` covering the match, or ``None`` when not found.

    BUGFIX: the default ``stop`` used to be ``-1``, which ``find`` interprets
    with slice semantics ("up to, but excluding, the last character"), so a
    match ending at the last character of ``data`` was never found.  The
    default is now ``None``, meaning "search to the end of data"; explicit
    integer arguments keep their usual slice meaning.
    """
    text = normalize_text_to_data(data, text, encoding, normalizer)
    index = data.find(text, start, stop)
    if -1 == index:
        return None

    return slice(index, index + len(text))


def replace_slice(data: Text, place: slice, text: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Tuple[Text, slice]:
    """Replace the ``place`` span of ``data`` with ``text``.

    Returns the new text and the slice now occupied by the inserted ``text``
    (``place.step`` is ignored).
    """
    text = normalize_text_to_data(data, text, encoding, normalizer)
    l_text = data[:place.start]
    r_text = data[place.stop:]
    result_text = l_text + text + r_text
    result_place = slice(place.start, place.start + len(text))
    return result_text, result_place


def replace_text(data: Text, old_text: Text, new_text: Text, count: int = -1, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Replace up to ``count`` occurrences of ``old_text`` with ``new_text``
    (``count == -1`` replaces all occurrences).

    Return annotation tightened from ``Optional[Text]`` to ``Text``:
    ``str.replace``/``bytes.replace`` never return ``None``.
    """
    old_text = normalize_text_to_data(data, old_text, encoding, normalizer)
    new_text = normalize_text_to_data(data, new_text, encoding, normalizer)
    return data.replace(old_text, new_text, count)


def normalize_line_separators(text: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Rewrite every line separator in ``text`` as ``'\\n'`` (converted to
    the type of ``text``).

    Note: a trailing line separator is dropped (``splitlines`` behavior),
    and for ``str`` input every Unicode line boundary is normalized.
    """
    lines = text.splitlines()
    line_separator = '\n'
    return normalize_text(line_separator, type(text), encoding, normalizer).join(lines)


def normalize_line_separators_and_tabs(text: Text, tabsize=4, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Normalize line separators to ``'\\n'`` and expand tabs to runs of up
    to ``tabsize`` spaces."""
    text = normalize_line_separators(text, encoding, normalizer)
    return text.expandtabs(tabsize)


def removeprefix(data: Text, prefix: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Return ``data`` with ``prefix`` removed if present (works on
    Python < 3.9 too)."""
    prefix = normalize_text_to_data(data, prefix, encoding, normalizer)
    if (3, 9) <= PYTHON_VERSION_INT:
        return data.removeprefix(prefix)

    if data.startswith(prefix):
        return data[len(prefix):]

    return data


def removesuffix(data: Text, suffix: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Return ``data`` with ``suffix`` removed if present (works on
    Python < 3.9 too).

    BUGFIX: the pre-3.9 fallback evaluated ``data[:-len(suffix):]`` even for
    an empty suffix; since ``data.endswith('')`` is always True and
    ``data[:-0]`` is empty, an empty ``suffix`` wiped the whole value.  An
    empty suffix now leaves ``data`` unchanged, matching ``str.removesuffix``.
    """
    suffix = normalize_text_to_data(data, suffix, encoding, normalizer)
    if (3, 9) <= PYTHON_VERSION_INT:
        return data.removesuffix(suffix)

    if suffix and data.endswith(suffix):
        return data[:-len(suffix)]

    return data


def to_identifier(text: Text, need_to_remove_repetitive: bool = True, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Turn ``text`` into a valid Python identifier, returned as the same
    type as the input.

    Every character outside ``[A-Za-z0-9_]`` becomes ``'_'``; repeated
    ``'_'`` runs are optionally collapsed; a leading ``'_'`` is prepended
    when the result is empty or starts with a digit; a trailing ``'_'`` is
    appended while the result is a Python keyword.

    BUGFIX: the function used to return the (normalized) *input* text,
    silently discarding the computed identifier; it now returns the
    identifier converted back to the input's type.
    """
    original_text = text
    text = normalize_text_to_data(str(), text, encoding, normalizer)
    valid_initial_chars = string.ascii_letters + '_'
    valid_chars = valid_initial_chars + string.digits
    text_chars: Set[str] = set(text)

    # Single C-level pass replacing every invalid character with '_'.
    trans = str.maketrans({
        char: '_' for char in text_chars if char not in valid_chars
    })
    identifier = text.translate(trans)
    if need_to_remove_repetitive:
        identifier = remove_repetitive(identifier, '_')

    if not identifier or identifier[0] not in valid_initial_chars:
        identifier = '_' + identifier

    while keyword.iskeyword(identifier):
        identifier += '_'

    return normalize_text_to_data(original_text, identifier, encoding, normalizer)


def remove_repetitive(data: Text, sub_str: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text:
    """Collapse consecutive repetitions of ``sub_str`` in ``data`` into a
    single occurrence; leading and trailing occurrences are removed entirely.

    (A previous ``normalized_data.strip(...)`` statement discarded its
    result — strings are immutable — and was redundant anyway: filtering out
    the empty split pieces below already drops leading/trailing separators.
    The dead statement was removed; behavior is unchanged.)
    """
    normalized_data: str = normalize_text_to_data(str(), data, encoding, normalizer)
    normalized_sub_str: str = normalize_text_to_data(str(), sub_str, encoding, normalizer)
    split_normalized_data: List[str] = normalized_data.split(normalized_sub_str)
    result: str = normalized_sub_str.join(piece for piece in split_normalized_data if piece)
    return normalize_text_to_data(data, result, encoding, normalizer)
Text =
typing.Union[bytes, bytearray, str]
BinText =
typing.Union[bytes, bytearray]
DEFAULT_ENCODING =
'utf-8'
class
EncodingRequired(builtins.Exception):
Common base class for all non-exit exceptions.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
class
NotSupportedDataType(builtins.Exception):
Common base class for all non-exit exceptions.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
class
NotSupportedDesiredTextType(builtins.Exception):
Common base class for all non-exit exceptions.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
def
normalize_text( text: Union[bytes, bytearray, str], desired_type: Type, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
74def normalize_text(text: Text, desired_type: Type, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 75 normalizer = normalizer or _default_normalizer 76 77 need_to_use_normalizer = False 78 if issubclass(desired_type, bytes): 79 if isinstance(text, bytes): 80 pass 81 elif isinstance(text, bytearray): 82 text = bytes(text) 83 elif isinstance(text, str): 84 if encoding: 85 text = text.encode(encoding) 86 else: 87 raise EncodingRequired 88 else: 89 need_to_use_normalizer = True 90 elif issubclass(desired_type, bytearray): 91 if isinstance(text, bytearray): 92 pass 93 elif isinstance(text, bytes): 94 text = bytearray(text) 95 elif isinstance(text, str): 96 if encoding: 97 text = bytearray(text, encoding) 98 else: 99 raise EncodingRequired 100 else: 101 need_to_use_normalizer = True 102 elif issubclass(desired_type, str): 103 if isinstance(text, str): 104 pass 105 elif isinstance(text, bytes) or isinstance(text, bytearray): 106 if encoding: 107 text = text.decode(encoding) 108 else: 109 raise EncodingRequired 110 else: 111 need_to_use_normalizer = True 112 else: 113 need_to_use_normalizer = True 114 115 if need_to_use_normalizer: 116 text = normalizer(text, desired_type, encoding) 117 118 return text
def
normalize_text_to_data( data: Union[bytes, bytearray, str], text: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
def
find_text( data: Union[bytes, bytearray, str], text: Union[bytes, bytearray, str], start: int = 0, stop: int = -1, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[slice, NoneType]:
129def find_text(data: Text, text: Text, start: int = 0, stop: int = -1, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Optional[slice]: 130 text = normalize_text_to_data(data, text, encoding, normalizer) 131 start = data.find(text, start, stop) 132 if -1 == start: 133 return None 134 135 return slice(start, start + len(text))
def
replace_slice( data: Union[bytes, bytearray, str], place: slice, text: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Tuple[Union[bytes, bytearray, str], slice]:
138def replace_slice(data: Text, place: slice, text: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Tuple[Text, slice]: 139 text = normalize_text_to_data(data, text, encoding, normalizer) 140 l_text = data[:place.start] 141 r_text = data[place.stop:] 142 result_text = l_text + text + r_text 143 result_place = slice(place.start, place.start + len(text)) 144 return result_text, result_place
def
replace_text( data: Union[bytes, bytearray, str], old_text: Union[bytes, bytearray, str], new_text: Union[bytes, bytearray, str], count: int = -1, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str, NoneType]:
147def replace_text(data: Text, old_text: Text, new_text: Text, count: int = -1, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Optional[Text]: 148 old_text = normalize_text_to_data(data, old_text, encoding, normalizer) 149 new_text = normalize_text_to_data(data, new_text, encoding, normalizer) 150 return data.replace(old_text, new_text, count)
def
normalize_line_separators( text: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
def
normalize_line_separators_and_tabs( text: Union[bytes, bytearray, str], tabsize=4, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
def
removeprefix( data: Union[bytes, bytearray, str], prefix: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
164def removeprefix(data: Text, prefix: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 165 prefix = normalize_text_to_data(data, prefix, encoding, normalizer) 166 if (3, 9) <= PYTHON_VERSION_INT: 167 return data.removeprefix(prefix) 168 else: 169 if data.startswith(prefix): 170 return data[len(prefix):] 171 else: 172 return data
def
removesuffix( data: Union[bytes, bytearray, str], suffix: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
175def removesuffix(data: Text, suffix: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 176 suffix = normalize_text_to_data(data, suffix, encoding, normalizer) 177 if (3, 9) <= PYTHON_VERSION_INT: 178 return data.removesuffix(suffix) 179 else: 180 if data.endswith(suffix): 181 return data[:-len(suffix):] 182 else: 183 return data
def
to_identifier( text: Union[bytes, bytearray, str], need_to_remove_repetitive: bool = True, encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
186def to_identifier(text: Text, need_to_remove_repetitive: bool = True, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 187 original_text = text 188 text = normalize_text_to_data(str(), text, encoding, normalizer) 189 valid_initial_chars = string.ascii_letters + '_' 190 valid_chars = valid_initial_chars + string.digits 191 text_chars: Set[str] = set(text) 192 193 trans = str.maketrans({ 194 char: '_' for char in text_chars if char not in valid_chars 195 }) 196 identifier = text.translate(trans) 197 if need_to_remove_repetitive: 198 identifier = remove_repetitive(identifier, '_') 199 200 if not identifier or identifier[0] not in valid_initial_chars: 201 identifier = '_' + identifier 202 203 while keyword.iskeyword(identifier): 204 identifier += '_' 205 206 return normalize_text_to_data(original_text, text, encoding, normalizer)
def
remove_repetitive( data: Union[bytes, bytearray, str], sub_str: Union[bytes, bytearray, str], encoding: Union[str, NoneType] = 'utf-8', normalizer: Union[Callable, NoneType] = None) -> Union[bytes, bytearray, str]:
209def remove_repetitive(data: Text, sub_str: Text, encoding: Optional[str] = DEFAULT_ENCODING, normalizer: Optional[Callable] = None) -> Text: 210 normalized_data: str = normalize_text_to_data(str(), data, encoding, normalizer) 211 normalized_sub_str: str = normalize_text_to_data(str(), sub_str, encoding, normalizer) 212 normalized_data.strip(normalized_sub_str) 213 split_normalized_data: List[str] = normalized_data.split(normalized_sub_str) 214 result: str = normalized_sub_str.join((piece for piece in split_normalized_data if piece)) 215 return normalize_text_to_data(data, result, encoding, normalizer)