cengal.text_processing.help_tools.versions.v_0.help_tools
#!/usr/bin/env python
# coding=utf-8

# Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cengal.code_flow_control.smart_values.versions.v_0 import ResultExistence
import string
from cengal.data_manipulation.conversion.binary import ubyte_to_bytes
from cengal.data_manipulation.conversion.sequence import get_slice_from_array
from typing import Tuple, Union

"""
Module Docstring
Docstrings: http://www.python.org/dev/peps/pep-0257/
"""

__author__ = "ButenkoMS <gtalk@butenkoms.space>"
__copyright__ = "Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>"
__credits__ = ["ButenkoMS <gtalk@butenkoms.space>", ]
__license__ = "Apache License, Version 2.0"
__version__ = "4.4.1"
__maintainer__ = "ButenkoMS <gtalk@butenkoms.space>"
__email__ = "gtalk@butenkoms.space"
# __status__ = "Prototype"
__status__ = "Development"
# __status__ = "Production"


class AbstractSetOfSymbols:
    """Declares the delimiter attributes; every one of them starts as None."""

    def __init__(self):
        self.ascii_word_delimiters = None
        self.ascii_word_delimiters__bytes = None
        self.ascii_word_delimiters__set = None
        self.ascii_word_delimiters__set_bytes = None

        # without "_" symbol
        self.ascii_modern_word_delimiters = None
        self.ascii_modern_word_delimiters__bytes = None
        self.ascii_modern_word_delimiters__set = None
        self.ascii_modern_word_delimiters__set_bytes = None


class SetOfSymbols(AbstractSetOfSymbols):
    """Delimiter collections built from ``string.punctuation + string.whitespace``."""

    def __init__(self):
        super(SetOfSymbols, self).__init__()
        self.ascii_word_delimiters = string.punctuation + string.whitespace
        self.ascii_word_delimiters__bytes = self.ascii_word_delimiters.encode()
        self.ascii_word_delimiters__set = set(self.ascii_word_delimiters)
        self.ascii_word_delimiters__set_bytes = {
            delim_char.encode() for delim_char in self.ascii_word_delimiters__set}

        # Same family without "_". The string is derived from the ordered source string
        # (not from a set), so its character order is identical on every run.
        self.ascii_modern_word_delimiters = self.ascii_word_delimiters.replace('_', '')
        self.ascii_modern_word_delimiters__bytes = self.ascii_modern_word_delimiters.encode()
        self.ascii_modern_word_delimiters__set = set(self.ascii_modern_word_delimiters)
        self.ascii_modern_word_delimiters__set_bytes = {
            delim_char.encode() for delim_char in self.ascii_modern_word_delimiters__set}


SET_OF_SYMBOLS = SetOfSymbols()


def get_text_in_brackets(data, left_b, right_b):
    """Return the part of ``data`` between the first ``left_b`` and the next ``right_b``.

    :param data: text (or bytes) to search in
    :param left_b: opening bracket sequence
    :param right_b: closing bracket sequence
    :return: the enclosed text. When ``left_b`` is absent the search for ``right_b`` starts at the
        beginning of ``data``; when ``right_b`` is absent everything up to the end is returned.
    """
    left_offset = data.find(left_b)
    if left_offset >= 0:
        data = data[left_offset + len(left_b):]
    right_offset = data.find(right_b)
    if right_offset >= 0:
        # Only cut when the closing bracket exists: slicing with -1 would drop the last character.
        data = data[:right_offset]
    return data


def get_text_in_brackets_offset(data, left_b, right_b, offset=0):
    """Like ``get_text_in_brackets`` but starts at ``offset`` and also reports where it stopped.

    :param data: text (or bytes) to search in
    :param left_b: opening bracket sequence
    :param right_b: closing bracket sequence
    :param offset: position in ``data`` to start from (applied only when greater than zero)
    :return: ``(text, next_offset)``; ``next_offset`` points just behind the closing bracket, or to
        the end of ``data`` when there is no closing bracket
    """
    if offset > 0:
        data = data[offset:]
    consumed = 0
    left_offset = data.find(left_b)
    if left_offset >= 0:
        consumed = left_offset + len(left_b)
        data = data[consumed:]
    right_offset = data.find(right_b)
    if right_offset < 0:
        # No closing bracket: the whole remainder is the result and everything was consumed.
        return data, offset + consumed + len(data)
    return data[:right_offset], offset + consumed + right_offset + len(right_b)


def detach_slice_from_string(string, substring, offset=0):
    """Split ``string`` around the first ``substring`` found at or after ``offset``.

    :return: ``(string_before, string_slice, string_after, end_of_slice)``
    :raises ValueError: from ``string.index`` when ``substring`` is not found
    """
    slice_index = string.index(substring, offset)
    end_of_slice = slice_index + len(substring)
    string_before = string[:slice_index]
    string_slice = string[slice_index:end_of_slice]
    string_after = string[end_of_slice:]
    return string_before, string_slice, string_after, end_of_slice


def detach_slice_from_string__case_insensitive(string, substring, offset=0):
    """Case-insensitive ``detach_slice_from_string``; the returned parts keep the original casing.

    :return: ``(string_before, string_slice, string_after, end_of_slice)``
    :raises ValueError: from ``index`` when ``substring`` is not found
    """
    # The match is located in lower-cased copies; the indexes are then applied to the original.
    slice_index = string.lower().index(substring.lower(), offset)
    end_of_slice = slice_index + len(substring)
    string_before = string[:slice_index]
    string_slice = string[slice_index:end_of_slice]
    string_after = string[end_of_slice:]
    return string_before, string_slice, string_after, end_of_slice


def find_substring(full_string: bytes, substring: bytes, offset: int=0)\
        ->Tuple[Union[None, int], Union[None, int]]:
    """Return ``(start, end)`` of the first ``substring`` at or after ``offset``, or ``(None, None)``."""
    start_index = full_string.find(substring, offset)
    if start_index < 0:
        return None, None
    return start_index, start_index + len(substring)


def find_substring_full_word(full_string: bytes, substring: bytes, offset: int=0, smart_word_bounds: bool=False)\
        ->Tuple[Union[None, int], Union[None, int]]:
    """Find the first occurrence of ``substring`` that stands as a whole word.

    :param full_string: data to search in
    :param substring: data to look for
    :param offset: position to start searching from
    :param smart_word_bounds: forwarded to ``find_substring_full_word__one_shot``
    :return: ``(start, end)`` of the whole-word occurrence, or ``(None, None)``
    """
    while True:
        start_index, end_index = find_substring_full_word__one_shot(
            full_string, substring, offset, smart_word_bounds)
        if start_index.result is None:
            # substring wasn't found at all
            return None, None
        if start_index and end_index:
            # full word was found
            return start_index.result, end_index.result
        # Resume one position after the rejected match. Jumping to its end would skip
        # occurrences that overlap it.
        offset = start_index.result + 1


class FindSubstringErrorFullStringCanNotBeEmpty(Exception):
    pass


class FindSubstringErrorSubstringCanNotBeEmpty(Exception):
    pass


def find_substring_full_word__one_shot(full_string: bytes, substring: bytes, offset: int=0,
                                       smart_word_bounds: bool=False)\
        ->Tuple[ResultExistence, ResultExistence]:
    """Locate the next ``substring`` and report whether each of its edges is a word boundary.

    :param full_string: data to search in
    :param substring: data to look for
    :param offset: position to start searching from
    :param smart_word_bounds: when True, an edge of ``substring`` that is itself a delimiter is
        accepted as a boundary without looking at the neighbouring byte
    :return: ``(word_start, word_end)``. ``result`` holds the index (None when not found) and
        ``existence`` tells whether that edge is a word boundary.
    """
    delimiters = SET_OF_SYMBOLS.ascii_modern_word_delimiters__set_bytes

    word_start = ResultExistence(False, None)
    word_end = ResultExistence(False, None)

    if (not full_string) or (not substring):
        return word_start, word_end

    is_word_start_is_delimiter = False
    is_word_end_is_delimiter = False
    if smart_word_bounds:
        is_word_start_is_delimiter = ubyte_to_bytes(substring[0]) in delimiters
        is_word_end_is_delimiter = ubyte_to_bytes(substring[-1]) in delimiters

    found_at = full_string.find(substring, offset)
    if found_at < 0:
        return word_start, word_end

    word_start.result = found_at
    if is_word_start_is_delimiter or (0 == found_at):
        word_start.existence = True
    elif ubyte_to_bytes(full_string[found_at - 1]) in delimiters:
        word_start.existence = True

    word_end.result = found_at + len(substring)
    if is_word_end_is_delimiter or (word_end.result == len(full_string)):
        # This branch must flag the END edge (it used to flag the start edge by mistake).
        word_end.existence = True
    elif ubyte_to_bytes(full_string[word_end.result]) in delimiters:
        word_end.existence = True

    return word_start, word_end


def check_is_slice_is_in_string(substring: bytes, full_string: bytes, check_whole_word=False,
                                smart_word_bounds: bool=False):
    """Tell whether ``substring`` occurs in ``full_string``, optionally only as a whole word."""
    if not check_whole_word:
        return substring in full_string
    word_start, word_end = find_substring_full_word(full_string, substring, smart_word_bounds=smart_word_bounds)
    return (word_start is not None) and (word_end is not None)


def check_is_slice_is_in_string__case_insensitive(substring: bytes, full_string: bytes, check_whole_word=False,
                                                  smart_word_bounds: bool=False):
    """Case-insensitive ``check_is_slice_is_in_string``: both arguments are lower-cased first."""
    return check_is_slice_is_in_string(substring.lower(), full_string.lower(), check_whole_word, smart_word_bounds)


def detach_all_slices_from_string(string, substring, function__detach=None, function__check_is_in=None):
    """
    :param string: input string
    :param substring: desired substring
    :param function__detach: detach_slice_from_string (when None) or detach_slice_from_string__case_insensitive
    :param function__check_is_in: check_is_slice_is_in_string (when None) or
        check_is_slice_is_in_string__case_insensitive
    :return: ([(original_string_part_0, string_slice), (original_string_part_1, string_slice), ...,
        (original_string_part_N, string_slice)], string_after)
    :raises ValueError: when substring is empty (it would match forever); the default detach function
        also raises ValueError when substring does not occur in string at all
    """
    if not substring:
        raise ValueError('substring can not be empty')

    function__detach = function__detach or detach_slice_from_string
    function__check_is_in = function__check_is_in or check_is_slice_is_in_string

    result_list = list()
    last_string_after = string
    while True:
        result = function__detach(last_string_after, substring)
        last_string_after = result[2]
        result_list.append((result[0], result[1]))
        if not function__check_is_in(substring, last_string_after):
            break
    return result_list, last_string_after


def detach_all_slices_from_string__case_insensitive(string, substring):
    """``detach_all_slices_from_string`` wired to the case-insensitive detach and check helpers."""
    return detach_all_slices_from_string(
        string, substring, detach_slice_from_string__case_insensitive, check_is_slice_is_in_string__case_insensitive)


def is_printable(s, codec='utf8'):
    """Return True when ``s.decode(codec)`` succeeds, False on ``UnicodeDecodeError``.

    NOTE: despite the name, only decodability is checked, not whether the characters are printable.
    """
    try:
        s.decode(codec)
    except UnicodeDecodeError:
        return False
    else:
        return True


def bytes_to_printable(bytes_data):
    """Return ``str(bytes_data)`` without its first two characters and its last character."""
    # For a ``bytes`` object this strips the leading ``b'`` and the closing quote of the repr.
    result = str(bytes_data)[2:-1]
    return result


def levenshtein_distance(a, b):
    """Calculates the Levenshtein distance between a and b."""
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n

    current_row = list(range(n + 1))  # Keep current and previous row, not entire matrix
    for i in range(1, m + 1):
        previous_row, current_row = current_row, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete, change = previous_row[j] + 1, current_row[j - 1] + 1, previous_row[j - 1]
            if a[j - 1] != b[i - 1]:
                change += 1
            current_row[j] = min(add, delete, change)

    return current_row[n]


def un_escape_str(text: str) -> str:
    """Interpret backslash escape sequences (``\\n``, ``\\t``, ``\\uXXXX``, ...) contained in ``text``.

    The text is encoded as latin-1 with ``backslashreplace`` before decoding with ``unicode_escape``.
    ``unicode_escape`` reads raw bytes as latin-1, so encoding as utf-8 first would mangle every
    non-ASCII character.
    """
    return text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
class
AbstractSetOfSymbols:
class AbstractSetOfSymbols:
    """Declares the eight delimiter attributes and initialises every one of them to None."""

    def __init__(self):
        # Full delimiter family: text, bytes, set of characters, set of single-character bytes.
        (self.ascii_word_delimiters,
         self.ascii_word_delimiters__bytes,
         self.ascii_word_delimiters__set,
         self.ascii_word_delimiters__set_bytes) = (None,) * 4

        # Same four views, but without the "_" symbol.
        (self.ascii_modern_word_delimiters,
         self.ascii_modern_word_delimiters__bytes,
         self.ascii_modern_word_delimiters__set,
         self.ascii_modern_word_delimiters__set_bytes) = (None,) * 4
class SetOfSymbols(AbstractSetOfSymbols):
    """Delimiter collections built from ``string.punctuation + string.whitespace``.

    Each family is available as text, as encoded bytes, as a set of characters and as a set of
    single-character ``bytes`` objects. The "modern" family is the same without the "_" symbol.
    """

    def __init__(self):
        super(SetOfSymbols, self).__init__()
        self.ascii_word_delimiters = string.punctuation + string.whitespace
        self.ascii_word_delimiters__bytes = self.ascii_word_delimiters.encode()
        self.ascii_word_delimiters__set = set(self.ascii_word_delimiters)
        self.ascii_word_delimiters__set_bytes = {
            delim_char.encode() for delim_char in self.ascii_word_delimiters__set}

        # Derive the "modern" string from the ordered source string rather than by joining a set:
        # set iteration order is not stable between interpreter runs, so the joined text (and its
        # bytes form) used to come out in a different character order on each run.
        self.ascii_modern_word_delimiters = self.ascii_word_delimiters.replace('_', '')
        self.ascii_modern_word_delimiters__bytes = self.ascii_modern_word_delimiters.encode()
        self.ascii_modern_word_delimiters__set = set(self.ascii_modern_word_delimiters)
        self.ascii_modern_word_delimiters__set_bytes = {
            delim_char.encode() for delim_char in self.ascii_modern_word_delimiters__set}
SET_OF_SYMBOLS =
<SetOfSymbols object>
def
get_text_in_brackets(data, left_b, right_b):
def get_text_in_brackets(data, left_b, right_b):
    """Return the part of ``data`` between the first ``left_b`` and the next ``right_b``.

    :param data: text (or bytes) to search in
    :param left_b: opening bracket sequence
    :param right_b: closing bracket sequence
    :return: the enclosed text. When ``left_b`` is absent the search for ``right_b`` starts at the
        beginning of ``data``; when ``right_b`` is absent everything up to the end is returned.
    """
    left_offset = data.find(left_b)
    if left_offset >= 0:
        # Skip the opening bracket only when it exists; with a -1 offset the old arithmetic
        # dropped ``len(left_b) - 1`` leading characters.
        data = data[left_offset + len(left_b):]
    right_offset = data.find(right_b)
    if right_offset >= 0:
        # Cut only when the closing bracket exists; ``data[:-1]`` used to remove the last character.
        data = data[:right_offset]
    return data
def
get_text_in_brackets_offset(data, left_b, right_b, offset=0):
def get_text_in_brackets_offset(data, left_b, right_b, offset=0):
    """Extract the text between ``left_b`` and ``right_b`` starting at ``offset``.

    :param data: text (or bytes) to search in
    :param left_b: opening bracket sequence
    :param right_b: closing bracket sequence
    :param offset: position in ``data`` to start from (applied only when greater than zero)
    :return: ``(text, next_offset)``. ``next_offset`` points just behind the closing bracket, or to
        the end of ``data`` when there is no closing bracket. When ``left_b`` is absent the search
        for ``right_b`` starts at ``offset``.
    """
    if offset > 0:
        data = data[offset:]

    # Number of characters skipped up to and including the opening bracket.
    consumed = 0
    left_offset = data.find(left_b)
    if left_offset >= 0:
        consumed = left_offset + len(left_b)
        data = data[consumed:]

    right_offset = data.find(right_b)
    if right_offset < 0:
        # No closing bracket: keep the whole remainder (the old code dropped its last character
        # and produced an offset based on -1) and report that everything was consumed.
        return data, offset + consumed + len(data)

    return data[:right_offset], offset + consumed + right_offset + len(right_b)
def
detach_slice_from_string(string, substring, offset=0):
def detach_slice_from_string(string, substring, offset=0):
    """Split ``string`` around the first ``substring`` found at or after ``offset``.

    :param string: text (or bytes) to split
    :param substring: piece to cut out
    :param offset: position to start searching from
    :return: ``(string_before, string_slice, string_after, end_of_slice)``, where ``end_of_slice``
        is the index just behind the detached piece
    :raises ValueError: from ``string.index`` when ``substring`` is not found
    """
    begin = string.index(substring, offset)
    end = begin + len(substring)
    return string[:begin], string[begin:end], string[end:], end
def
detach_slice_from_string__case_insensitive(string, substring, offset=0):
def detach_slice_from_string__case_insensitive(string, substring, offset=0):
    """Split ``string`` around the first case-insensitive match of ``substring``.

    The match is located in lower-cased copies of both arguments; the indexes found there are then
    applied to the original ``string``, so the returned parts keep their original casing.

    :param string: text (or bytes) to split
    :param substring: piece to cut out, compared ignoring case
    :param offset: position to start searching from
    :return: ``(string_before, string_slice, string_after, end_of_slice)``
    :raises ValueError: from ``index`` when ``substring`` is not found
    """
    begin = string.lower().index(substring.lower(), offset)
    end = begin + len(substring)
    return string[:begin], string[begin:end], string[end:], end
def
find_substring( full_string: bytes, substring: bytes, offset: int = 0) -> Tuple[Union[NoneType, int], Union[NoneType, int]]:
def find_substring(full_string: bytes, substring: bytes, offset: int=0)\
        ->Tuple[Union[None, int], Union[None, int]]:
    """Locate the first ``substring`` in ``full_string`` at or after ``offset``.

    :param full_string: data to search in
    :param substring: data to look for
    :param offset: position to start searching from
    :return: ``(start_index, end_index)`` of the match (``end_index`` is exclusive), or
        ``(None, None)`` when there is no match
    """
    found_at = full_string.find(substring, offset)
    if found_at < 0:
        return None, None
    return found_at, found_at + len(substring)
def
find_substring_full_word( full_string: bytes, substring: bytes, offset: int = 0, smart_word_bounds: bool = False) -> Tuple[cengal.code_flow_control.smart_values.versions.v_0.result_types.ResultExistence, cengal.code_flow_control.smart_values.versions.v_0.result_types.ResultExistence]:
def find_substring_full_word(full_string: bytes, substring: bytes, offset: int=0, smart_word_bounds: bool=False)\
        ->Tuple[Union[None, int], Union[None, int]]:
    """Find the first occurrence of ``substring`` that stands as a whole word.

    Repeatedly calls ``find_substring_full_word__one_shot`` until it reports an occurrence whose
    start and end are both flagged as word boundaries, or until no occurrence is left.

    :param full_string: data to search in
    :param substring: data to look for
    :param offset: position to start searching from
    :param smart_word_bounds: forwarded unchanged to ``find_substring_full_word__one_shot``
    :return: ``(start_index, end_index)`` as plain integers for the whole-word occurrence, or
        ``(None, None)`` when there is none
    """
    while True:
        start_index, end_index = find_substring_full_word__one_shot(
            full_string, substring, offset, smart_word_bounds)

        if start_index.result is None:
            # substring wasn't found at all
            return None, None

        if start_index and end_index:
            # both edges are word boundaries: full word was found
            return start_index.result, end_index.result

        # Resume one position after the rejected occurrence. Resuming at its end would skip
        # occurrences overlapping it, e.g. b"a a" inside b"xa a a".
        offset = start_index.result + 1
class
FindSubstringErrorFullStringCanNotBeEmpty(builtins.Exception):
Common base class for all non-exit exceptions.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
class
FindSubstringErrorSubstringCanNotBeEmpty(builtins.Exception):
Common base class for all non-exit exceptions.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
def
find_substring_full_word__one_shot( full_string: bytes, substring: bytes, offset: int = 0, smart_word_bounds: bool = False) -> Tuple[cengal.code_flow_control.smart_values.versions.v_0.result_types.ResultExistence, cengal.code_flow_control.smart_values.versions.v_0.result_types.ResultExistence]:
def find_substring_full_word__one_shot(full_string: bytes, substring: bytes, offset: int=0,
                                       smart_word_bounds: bool=False)\
        ->Tuple[ResultExistence, ResultExistence]:
    """Locate the next ``substring`` and report whether each of its edges is a word boundary.

    :param full_string: data to search in
    :param substring: data to look for
    :param offset: position to start searching from
    :param smart_word_bounds: when True, an edge of ``substring`` that is itself a delimiter is
        accepted as a boundary without inspecting the neighbouring byte of ``full_string``
    :return: ``(word_start, word_end)``. ``result`` holds the index (``None`` when the substring was
        not found or an argument is empty) and ``existence`` tells whether that edge is a word
        boundary.
    """
    delimiters = SET_OF_SYMBOLS.ascii_modern_word_delimiters__set_bytes

    word_start = ResultExistence(False, None)
    word_end = ResultExistence(False, None)

    if (not full_string) or (not substring):
        return word_start, word_end

    is_word_start_is_delimiter = False
    is_word_end_is_delimiter = False
    if smart_word_bounds:
        is_word_start_is_delimiter = ubyte_to_bytes(substring[0]) in delimiters
        is_word_end_is_delimiter = ubyte_to_bytes(substring[-1]) in delimiters

    found_at = full_string.find(substring, offset)
    if found_at < 0:
        return word_start, word_end

    # Start edge: a delimiter-led substring, the very beginning of the data, or a delimiter
    # right before the match.
    word_start.result = found_at
    if is_word_start_is_delimiter or (0 == found_at):
        word_start.existence = True
    elif ubyte_to_bytes(full_string[found_at - 1]) in delimiters:
        word_start.existence = True

    # End edge: a delimiter-terminated substring, the very end of the data, or a delimiter
    # right after the match.
    word_end.result = found_at + len(substring)
    if is_word_end_is_delimiter or (word_end.result == len(full_string)):
        # This branch must flag the END edge; it used to set word_start.existence by mistake,
        # so such matches never had their end flagged and had their start flagged unconditionally.
        word_end.existence = True
    elif ubyte_to_bytes(full_string[word_end.result]) in delimiters:
        word_end.existence = True

    return word_start, word_end
def
check_is_slice_is_in_string( substring: bytes, full_string: bytes, check_whole_word=False, smart_word_bounds: bool = False):
def check_is_slice_is_in_string(substring: bytes, full_string: bytes, check_whole_word=False,
                                smart_word_bounds: bool=False):
    """Tell whether ``substring`` occurs in ``full_string``.

    :param substring: data to look for
    :param full_string: data to search in
    :param check_whole_word: when True, only an occurrence reported by ``find_substring_full_word``
        counts; otherwise a plain ``in`` test is used
    :param smart_word_bounds: forwarded to ``find_substring_full_word``
    :return: True when an occurrence exists, False otherwise
    """
    if not check_whole_word:
        return substring in full_string

    word_start, word_end = find_substring_full_word(full_string, substring, smart_word_bounds=smart_word_bounds)
    return (word_start is not None) and (word_end is not None)
def
check_is_slice_is_in_string__case_insensitive( substring: bytes, full_string: bytes, check_whole_word=False, smart_word_bounds: bool = False):
def
detach_all_slices_from_string(string, substring, function__detach=None, function__check_is_in=None):
def detach_all_slices_from_string(string, substring, function__detach=None, function__check_is_in=None):
    """
    Cut ``string`` at every occurrence of ``substring``.

    :param string: input string
    :param substring: desired substring
    :param function__detach: detach_slice_from_string (when None) or detach_slice_from_string__case_insensitive
    :param function__check_is_in: check_is_slice_is_in_string (when None) or
        check_is_slice_is_in_string__case_insensitive
    :return: ([(original_string_part_0, string_slice), (original_string_part_1, string_slice), ...,
        (original_string_part_N, string_slice)], string_after)
    :raises ValueError: when substring is empty (an empty substring is found in every remainder, so
        the loop would never end); the default detach function also raises ValueError when
        substring does not occur in string at all
    """
    if not substring:
        raise ValueError('substring can not be empty')

    function__detach = function__detach or detach_slice_from_string
    function__check_is_in = function__check_is_in or check_is_slice_is_in_string

    result_list = list()
    last_string_after = string
    while True:
        # The first detach is unconditional, exactly as before: a missing substring is an error.
        result = function__detach(last_string_after, substring)
        last_string_after = result[2]
        result_list.append((result[0], result[1]))
        if not function__check_is_in(substring, last_string_after):
            break

    return result_list, last_string_after
:param string: input string :param substring: desired substring :param function__detach: detach_slice_from_string (when None) or detach_slice_from_string__case_insensitive :param function__check_is_in: check_is_slice_is_in_string (when None) or check_is_slice_is_in_string__case_insensitive :return: ([(original_string_part_0, string_slice), (original_string_part_1, string_slice), ..., (original_string_part_N, string_slice)], string_after)
def
detach_all_slices_from_string__case_insensitive(string, substring):
def
is_printable(s, codec='utf8'):
def
bytes_to_printable(bytes_data):
def
levenshtein_distance(a, b):
def levenshtein_distance(a, b):
    """Calculates the Levenshtein distance between a and b.

    :param a: first sequence (anything supporting ``len`` and indexing)
    :param b: second sequence
    :return: minimal number of single-item insertions, deletions and substitutions turning one
        sequence into the other
    """
    short_len, long_len = len(a), len(b)
    if short_len > long_len:
        # Keep the shorter sequence in ``a`` so each row has only min(len(a), len(b)) + 1 cells.
        a, b = b, a
        short_len, long_len = long_len, short_len

    # Only two rows of the matrix are alive at any time: the previous one and the current one.
    row = list(range(short_len + 1))
    for i in range(1, long_len + 1):
        above, row = row, [i] + [0] * short_len
        for j in range(1, short_len + 1):
            substitution_cost = 0 if a[j - 1] == b[i - 1] else 1
            row[j] = min(
                above[j] + 1,                      # add
                row[j - 1] + 1,                    # delete
                above[j - 1] + substitution_cost,  # change
            )

    return row[short_len]
Calculates the Levenshtein distance between a and b.
def
un_escape_str(text: str) -> str: