cengal.text_processing.utf_bom_processing.versions.v_0.utf_bom_processing
#!/usr/bin/env python
# coding=utf-8

# Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helpers for working with a Unicode byte order mark (BOM) in binary text.

The module can:

* detect which known BOM (UTF-8, UTF-16 BE/LE, UTF-32 BE/LE) a ``bytes`` or
  ``bytearray`` value starts with (``determine_text_bom``);
* strip that BOM from the value (``remove_bom``);
* map a BOM to the name of the matching codec (``determine_bom_encoding``);
* decode binary text while replacing undecodable bytes
  (``decode_text_and_remove_all_wrong_symbols``).
"""


__all__ = ['WrongTextType', 'KNOWN_BOMS', 'determine_text_bom', 'remove_bom', 'determine_bom_encoding', 'decode_text_and_remove_all_wrong_symbols']


__author__ = "ButenkoMS <gtalk@butenkoms.space>"
__copyright__ = "Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>"
__credits__ = ["ButenkoMS <gtalk@butenkoms.space>", ]
__license__ = "Apache License, Version 2.0"
__version__ = "4.4.1"
__maintainer__ = "ButenkoMS <gtalk@butenkoms.space>"
__email__ = "gtalk@butenkoms.space"
# __status__ = "Prototype"
__status__ = "Development"
# __status__ = "Production"


import platform
import sys
import codecs
from typing import Optional, Union
from cengal.text_processing.text_processing import Text, DEFAULT_ENCODING, normalize_text, removeprefix


class WrongTextType(Exception):
    """Raised when a value that is neither ``bytes`` nor ``bytearray`` is given."""


# Maps every supported BOM to the name of the codec that decodes the payload
# following it.
KNOWN_BOMS = {
    codecs.BOM_UTF8: 'utf-8',
    codecs.BOM_UTF16_BE: 'utf-16be',
    codecs.BOM_UTF16_LE: 'utf-16le',
    codecs.BOM_UTF32_BE: 'utf-32be',
    codecs.BOM_UTF32_LE: 'utf-32le',
}
# Order in which BOMs are probed. It is significant: the UTF-32 LE BOM
# (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE), so the UTF-32 marks
# must be tested before the UTF-16 ones or UTF-32 LE text would be
# misdetected as UTF-16 LE.
KNOWN_BOMS_ORDER = [
    codecs.BOM_UTF8,
    codecs.BOM_UTF32_BE,
    codecs.BOM_UTF32_LE,
    codecs.BOM_UTF16_BE,
    codecs.BOM_UTF16_LE,
]


def determine_text_bom(text: Union[bytes, bytearray]) -> Union[bytes, bytearray]:
    """Return the known BOM that ``text`` starts with.

    The candidates from ``KNOWN_BOMS_ORDER`` are tried in order and the first
    one that is a prefix of ``text`` is returned.

    Args:
        text: Binary text to inspect.

    Returns:
        The matching BOM, or an empty value when ``text`` does not start with
        any known BOM. For ``bytes`` input the result is ``bytes``; for
        ``bytearray`` input the empty result is a ``bytearray`` and a match is
        whatever ``normalize_text(bom, bytearray)`` produced (presumably a
        ``bytearray`` - confirm against ``normalize_text``).

    Raises:
        WrongTextType: If ``text`` is neither ``bytes`` nor ``bytearray``.
    """
    if not isinstance(text, (bytes, bytearray)):
        raise WrongTextType(
            'expected bytes or bytearray, got {}'.format(type(text).__name__)
        )

    if isinstance(text, bytearray):
        # Candidates are converted so that the returned BOM matches the
        # container type of the input.
        candidates = [normalize_text(bom, bytearray) for bom in KNOWN_BOMS_ORDER]
        absent_bom = bytearray()
    else:
        candidates = list(KNOWN_BOMS_ORDER)
        absent_bom = b''

    for bom in candidates:
        if text.startswith(bom):
            return bom

    return absent_bom


def remove_bom(text: Union[bytes, bytearray], bom: Union[bytes, bytearray]) -> Union[bytes, bytearray]:
    """Return ``text`` with the leading ``bom`` stripped.

    Delegates to the project helper ``removeprefix``; the exact behaviour when
    ``text`` does not start with ``bom`` is defined by that helper.

    Args:
        text: Binary text, possibly starting with ``bom``.
        bom: The BOM to strip, e.g. the result of ``determine_text_bom``.

    Returns:
        The result of ``removeprefix(text, bom)``.
    """
    return removeprefix(text, bom)


def determine_bom_encoding(bom: Union[bytes, bytearray]) -> Optional[str]:
    """Return the codec name registered for ``bom`` in ``KNOWN_BOMS``.

    Args:
        bom: A BOM value; it is passed through ``normalize_text(bom, bytes)``
            before the lookup so it can be used as a dictionary key.

    Returns:
        The codec name (for example ``'utf-8'``), or ``None`` when ``bom`` is
        not a known BOM (this includes the empty "no BOM" value).
    """
    return KNOWN_BOMS.get(normalize_text(bom, bytes))


def decode_text_and_remove_all_wrong_symbols(text: Union[bytes, bytearray], encoding: str) -> str:
    """Decode ``text`` with ``encoding`` without failing on malformed input.

    The ``'replace'`` error handler is used, so byte sequences that cannot be
    decoded are substituted with the Unicode replacement character rather than
    raising ``UnicodeDecodeError``.

    Args:
        text: Binary text to decode.
        encoding: Name of the codec to decode with.

    Returns:
        The decoded string.
    """
    return text.decode(encoding, 'replace')
class
WrongTextType(builtins.Exception):
Common base class for all non-exit exceptions.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
KNOWN_BOMS =
{b'\xef\xbb\xbf': 'utf-8', b'\xfe\xff': 'utf-16be', b'\xff\xfe': 'utf-16le', b'\x00\x00\xfe\xff': 'utf-32be', b'\xff\xfe\x00\x00': 'utf-32le'}
def
determine_text_bom(text: Union[bytes, bytearray]) -> Union[bytes, bytearray]:
def determine_text_bom(text: Union[bytes, bytearray]) -> Union[bytes, bytearray]:
    """Return the first known BOM that ``text`` starts with.

    Candidates are taken from ``KNOWN_BOMS_ORDER`` and tried in that order.
    When nothing matches, an empty value is returned: ``b''`` for ``bytes``
    input and an empty ``bytearray`` for ``bytearray`` input.

    Raises:
        WrongTextType: If ``text`` is neither ``bytes`` nor ``bytearray``.
    """
    if not isinstance(text, (bytes, bytearray)):
        raise WrongTextType

    if isinstance(text, bytearray):
        # Probe with candidates converted via normalize_text so the value
        # handed back corresponds to the bytearray input.
        candidates = [
            normalize_text(known_bom, bytearray) for known_bom in KNOWN_BOMS_ORDER
        ]
        no_bom = bytearray(b'')
    else:
        candidates = list(KNOWN_BOMS_ORDER)
        no_bom = b''

    for candidate in candidates:
        if text.startswith(candidate):
            return candidate

    return no_bom
def
remove_bom( text: Union[bytes, bytearray], bom: Union[bytes, bytearray]) -> Union[bytes, bytearray]:
def
determine_bom_encoding(bom: Union[bytes, bytearray]) -> Union[str, NoneType]:
def
decode_text_and_remove_all_wrong_symbols(text: Union[bytes, bytearray], encoding: str) -> str: