cengal.text_processing.utf_bom_processing.versions.v_0.utf_bom_processing

View Source

 1#!/usr/bin/env python
 2# coding=utf-8
 3
 4# Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>
 5# 
 6# Licensed under the Apache License, Version 2.0 (the "License");
 7# you may not use this file except in compliance with the License.
 8# You may obtain a copy of the License at
 9# 
10#     http://www.apache.org/licenses/LICENSE-2.0
11# 
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17
18
19__all__ = ['WrongTextType', 'KNOWN_BOMS', 'determine_text_bom', 'remove_bom', 'determine_bom_encoding', 'decode_text_and_remove_all_wrong_symbols']
20
21
22"""
23Module Docstring
24Docstrings: http://www.python.org/dev/peps/pep-0257/
25"""
26
27
28__author__ = "ButenkoMS <gtalk@butenkoms.space>"
29__copyright__ = "Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>"
30__credits__ = ["ButenkoMS <gtalk@butenkoms.space>", ]
31__license__ = "Apache License, Version 2.0"
32__version__ = "4.4.1"
33__maintainer__ = "ButenkoMS <gtalk@butenkoms.space>"
34__email__ = "gtalk@butenkoms.space"
35# __status__ = "Prototype"
36__status__ = "Development"
37# __status__ = "Production"
38
39
40import platform, sys
41import codecs
42from typing import Optional, Union
43from cengal.text_processing.text_processing import Text, DEFAULT_ENCODING, normalize_text, removeprefix
44
45
class WrongTextType(Exception):
    """Signals that text was supplied as neither ``bytes`` nor ``bytearray``.

    Within this module it is raised by ``determine_text_bom``.
    """
48
49
# Codec name for every byte-order mark this module recognises, keyed by the
# raw BOM bytes.
KNOWN_BOMS = {
    codecs.BOM_UTF8: 'utf-8',
    codecs.BOM_UTF16_BE: 'utf-16be', codecs.BOM_UTF16_LE: 'utf-16le',
    codecs.BOM_UTF32_BE: 'utf-32be', codecs.BOM_UTF32_LE: 'utf-32le',
}

# Order in which determine_text_bom() probes the start of the text. The UTF-32
# marks come before the UTF-16 ones because the UTF-32 LE mark
# (b'\xff\xfe\x00\x00') itself begins with the UTF-16 LE mark (b'\xff\xfe');
# probing UTF-16 first would hide every UTF-32 LE match.
KNOWN_BOMS_ORDER = [
    codecs.BOM_UTF8,
    codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE,
    codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE,
]
64
65
def determine_text_bom(text: Union[bytes, bytearray]) -> Union[bytes, bytearray]:
    """Return the byte-order mark that ``text`` starts with, if any.

    The candidates in ``KNOWN_BOMS_ORDER`` are tried in order and the first
    one that prefixes ``text`` wins.

    Args:
        text: Raw encoded text.

    Returns:
        The matching BOM, or an empty value when ``text`` starts with none of
        the known BOMs. The result is a ``bytearray`` when ``text`` is a
        ``bytearray`` and ``bytes`` otherwise.

    Raises:
        WrongTextType: If ``text`` is neither ``bytes`` nor ``bytearray``.
    """
    if not isinstance(text, (bytes, bytearray)):
        raise WrongTextType(
            'text must be bytes or bytearray, got {}'.format(type(text).__name__)
        )

    wants_bytearray = isinstance(text, bytearray)

    # bytearray.startswith() accepts any bytes-like prefix, so the bytes
    # constants can be matched directly; only the value handed back has to be
    # converted to the caller's type (via the same helper as before).
    for bom in KNOWN_BOMS_ORDER:
        if text.startswith(bom):
            return normalize_text(bom, bytearray) if wants_bytearray else bom

    return bytearray() if wants_bytearray else b''
87
88
def remove_bom(text: Union[bytes, bytearray], bom: Union[bytes, bytearray]) -> Union[bytes, bytearray]:
    """Strip ``bom`` from the start of ``text``.

    Thin wrapper that delegates to the project's ``removeprefix`` helper.

    Args:
        text: Raw encoded text.
        bom: The byte-order mark to strip.

    Returns:
        Whatever ``removeprefix(text, bom)`` returns.
        NOTE(review): presumably ``text`` comes back unchanged when it does
        not start with ``bom`` -- confirm against ``removeprefix``.
    """
    stripped = removeprefix(text, bom)
    return stripped
91
92
def determine_bom_encoding(bom: Union[bytes, bytearray]) -> Optional[str]:
    """Look up the codec name associated with a byte-order mark.

    Args:
        bom: A byte-order mark.

    Returns:
        The codec name registered for ``bom`` in ``KNOWN_BOMS``, or ``None``
        when ``bom`` is not one of its keys.
    """
    # KNOWN_BOMS is keyed by bytes objects, so the argument is passed through
    # normalize_text(..., bytes) before being used as the lookup key.
    lookup_key = normalize_text(bom, bytes)
    return KNOWN_BOMS.get(lookup_key)
96
97
def decode_text_and_remove_all_wrong_symbols(text: Union[bytes, bytearray], encoding: str) -> str:
    """Decode ``text`` with ``encoding`` without failing on malformed bytes.

    Decoding uses the ``'replace'`` error handler. Despite the function's
    name, undecodable input is therefore not dropped from the result: it is
    substituted with the replacement character U+FFFD.

    Args:
        text: Raw encoded text.
        encoding: Name of the codec to decode with.

    Returns:
        The decoded string.
    """
    decoded = text.decode(encoding, errors='replace')
    return decoded

class WrongTextType(builtins.Exception): View Source

class WrongTextType(Exception):
    """Signals that text was supplied as neither ``bytes`` nor ``bytearray``.

    Within this module it is raised by ``determine_text_bom``.
    """

Common base class for all non-exit exceptions.

Inherited Members

builtins.Exception: Exception
builtins.BaseException: with_traceback; args

KNOWN_BOMS = {b'\xef\xbb\xbf': 'utf-8', b'\xfe\xff': 'utf-16be', b'\xff\xfe': 'utf-16le', b'\x00\x00\xfe\xff': 'utf-32be', b'\xff\xfe\x00\x00': 'utf-32le'}

def determine_text_bom(text: Union[bytes, bytearray]) -> Union[bytes, bytearray]: View Source

def determine_text_bom(text: Union[bytes, bytearray]) -> Union[bytes, bytearray]:
    """Return the byte-order mark that ``text`` starts with, if any.

    The candidates in ``KNOWN_BOMS_ORDER`` are tried in order and the first
    one that prefixes ``text`` wins.

    Args:
        text: Raw encoded text.

    Returns:
        The matching BOM, or an empty value when ``text`` starts with none of
        the known BOMs. The result is a ``bytearray`` when ``text`` is a
        ``bytearray`` and ``bytes`` otherwise.

    Raises:
        WrongTextType: If ``text`` is neither ``bytes`` nor ``bytearray``.
    """
    if not isinstance(text, (bytes, bytearray)):
        raise WrongTextType(
            'text must be bytes or bytearray, got {}'.format(type(text).__name__)
        )

    wants_bytearray = isinstance(text, bytearray)

    # bytearray.startswith() accepts any bytes-like prefix, so the bytes
    # constants can be matched directly; only the value handed back has to be
    # converted to the caller's type (via the same helper as before).
    for bom in KNOWN_BOMS_ORDER:
        if text.startswith(bom):
            return normalize_text(bom, bytearray) if wants_bytearray else bom

    return bytearray() if wants_bytearray else b''

def remove_bom( text: Union[bytes, bytearray], bom: Union[bytes, bytearray]) -> Union[bytes, bytearray]: View Source

def remove_bom(text: Union[bytes, bytearray], bom: Union[bytes, bytearray]) -> Union[bytes, bytearray]:
    """Strip ``bom`` from the start of ``text``.

    Thin wrapper that delegates to the project's ``removeprefix`` helper.

    Args:
        text: Raw encoded text.
        bom: The byte-order mark to strip.

    Returns:
        Whatever ``removeprefix(text, bom)`` returns.
        NOTE(review): presumably ``text`` comes back unchanged when it does
        not start with ``bom`` -- confirm against ``removeprefix``.
    """
    stripped = removeprefix(text, bom)
    return stripped

def determine_bom_encoding(bom: Union[bytes, bytearray]) -> Union[str, NoneType]: View Source

def determine_bom_encoding(bom: Union[bytes, bytearray]) -> Optional[str]:
    """Look up the codec name associated with a byte-order mark.

    Args:
        bom: A byte-order mark.

    Returns:
        The codec name registered for ``bom`` in ``KNOWN_BOMS``, or ``None``
        when ``bom`` is not one of its keys.
    """
    # KNOWN_BOMS is keyed by bytes objects, so the argument is passed through
    # normalize_text(..., bytes) before being used as the lookup key.
    lookup_key = normalize_text(bom, bytes)
    return KNOWN_BOMS.get(lookup_key)

def decode_text_and_remove_all_wrong_symbols(text: Union[bytes, bytearray], encoding: str) -> str: View Source

 99def decode_text_and_remove_all_wrong_symbols(text: Union[bytes, bytearray], encoding: str) -> str:
100    return text.decode(encoding, 'replace')