cengal.text_processing.encoding_detection.versions.v_0.encoding_detection
#!/usr/bin/env python
# coding=utf-8

# Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


__all__ = [
    'decode',
    'detect_and_decode',
    'is_utf8_text',
    'is_text_is_7bit_utf8_compatible',
    'is_probably_utf8',
    'is_utf8',
]


from typing import Tuple, Union
try:
    import cchardet as chardet
except ImportError:
    # cchardet is optional: without this guard a missing package would abort
    # the module import before the CHARDET_PRESENT fallback below could run.
    chardet = None
from cengal.modules_management.alternative_import import alt_import
# NOTE(review): alt_import presumably yields None when the module cannot be
# imported - confirm against cengal.modules_management.alternative_import.
with alt_import('cchardet') as chardet:
    if chardet is None:
        CHARDET_PRESENT: bool = False
    else:
        CHARDET_PRESENT = True

from charset_normalizer import detect as cn_detect
from cengal.text_processing.text_processing import Text, normalize_text
from cengal.text_processing.utf_bom_processing import *


"""
Module Docstring
Docstrings: http://www.python.org/dev/peps/pep-0257/
"""

__author__ = "ButenkoMS <gtalk@butenkoms.space>"
__copyright__ = "Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>"
__credits__ = ["ButenkoMS <gtalk@butenkoms.space>", ]
__license__ = "Apache License, Version 2.0"
__version__ = "4.4.1"
__maintainer__ = "ButenkoMS <gtalk@butenkoms.space>"
__email__ = "gtalk@butenkoms.space"
# __status__ = "Prototype"
__status__ = "Development"
# __status__ = "Production"


def detect_and_decode(text: Union[bytes, bytearray], detect_as_utf8_when_possible: bool = True, check_text_for_utf8_compliance: bool = True) -> Tuple[str, str, bytes]:
    """Detect the encoding of ``text`` and decode it.

    Args:
        text: Raw bytes to decode.
        detect_as_utf8_when_possible: When true, report ``'utf-8'`` instead of
            the detected name if the detected encoding is listed in one of the
            module's UTF-8 compatibility tables.
        check_text_for_utf8_compliance: When true (and the previous flag is
            set), names from the "half compatible" table are reported as
            ``'utf-8'`` only if every byte of the text is 7-bit.

    Returns:
        A ``(decoded_text, encoding_name, bom_bytes)`` tuple. ``bom_bytes`` is
        the BOM found at the start of the input, or ``b''`` when the encoding
        was detected statistically. Empty input yields ``('', 'utf-8', b'')``.
    """
    if not text:
        return str(), 'utf-8', bytes()

    text = normalize_text(text, bytes)
    possible_utf_bom = determine_text_bom(text)
    text = remove_bom(text, possible_utf_bom)
    possible_encoding = determine_bom_encoding(possible_utf_bom)
    if possible_encoding is not None:
        # A BOM identifies the encoding, so no statistical detection is needed.
        return decode_text_and_remove_all_wrong_symbols(text, possible_encoding), possible_encoding, possible_utf_bom

    try_charset_normalizer = False
    try:
        if CHARDET_PRESENT:
            detection = chardet.detect(text)
        else:
            try_charset_normalizer = True
    except LookupError:
        try_charset_normalizer = True

    if try_charset_normalizer:
        detection = cn_detect(text)

    encoding = detection["encoding"]
    bom_bytes = bytes()
    if encoding is None:
        # The detector could not name an encoding. Calling ``.lower()`` or
        # ``bytes.decode`` with None would raise, so fall back to a lossy
        # UTF-8 decode instead of crashing.
        return text.decode('utf-8', errors='replace'), 'utf-8', bom_bytes

    if detect_as_utf8_when_possible:
        if check_text_for_utf8_compliance:
            result_encoding = 'utf-8' if is_utf8_text(encoding, text) else encoding
        else:
            result_encoding = 'utf-8' if (is_utf8(encoding) or is_probably_utf8(encoding)) else encoding
    else:
        result_encoding = encoding

    # The bytes are decoded with the *detected* encoding; ``result_encoding``
    # is only the name reported back to the caller.
    return text.decode(encoding), result_encoding, bom_bytes


# Encoding names that are always reported as 'utf-8'.
utf8_compatible_encodings = {
    'utf-8',
    'ISO-8859-1',
    'Latin 1',
}
utf8_compatible_encodings_lower = {encoding.lower() for encoding in utf8_compatible_encodings}


# Encoding names reported as 'utf-8' only when the text is 7-bit clean
# (or unconditionally when the compliance check is disabled).
utf8_half_compatible_encodings = {
    'US-ASCII',
    'ASCII',
    'ANSI_X3.4-1968',
    'iso-ir-6',
    'ANSI_X3.4-1986',
    'ISO_646.irv:1991',
    'ASCII-7',
    'ASCII-8',
    'ISO646-US',
    'us',
    'IBM367',
    'cp367',
    'csASCII',
}
utf8_half_compatible_encodings_lower = {encoding.lower() for encoding in utf8_half_compatible_encodings}


def is_utf8(encoding: str) -> bool:
    """Return True if ``encoding`` (case-insensitive) is in ``utf8_compatible_encodings``."""
    return encoding.lower() in utf8_compatible_encodings_lower


def is_probably_utf8(encoding: str) -> bool:
    """Return True if ``encoding`` (case-insensitive) is in ``utf8_half_compatible_encodings``."""
    return encoding.lower() in utf8_half_compatible_encodings_lower


def is_text_is_7bit_utf8_compatible(text: Union[bytes, bytearray]) -> bool:
    """Return True if every byte of ``text`` is at most 127 (True for empty input)."""
    return all(b <= 127 for b in text)


def is_utf8_text(encoding: str, text: Union[bytes, bytearray]) -> bool:
    """Decide whether ``text`` detected as ``encoding`` should be reported as UTF-8.

    Returns True when ``is_utf8(encoding)`` holds, or when
    ``is_probably_utf8(encoding)`` holds and all bytes of ``text`` are 7-bit.
    """
    if is_utf8(encoding):
        return True
    elif is_probably_utf8(encoding):
        return is_text_is_7bit_utf8_compatible(text)
    else:
        return False


def decode(text: Union[bytes, bytearray]) -> str:
    """Decode ``text`` with ``detect_and_decode`` and return only the decoded string."""
    text, encoding, bom_bytes = detect_and_decode(text)
    return text
def
decode(text: Union[bytes, bytearray]) -> str:
def
detect_and_decode( text: Union[bytes, bytearray], detect_as_utf8_when_possible: bool = True, check_text_for_utf8_compliance: bool = True) -> Tuple[str, str, bytes]:
def detect_and_decode(text: Union[bytes, bytearray], detect_as_utf8_when_possible: bool = True, check_text_for_utf8_compliance: bool = True) -> Tuple[str, str, bytes]:
    """Detect the encoding of ``text`` and decode it.

    Args:
        text: Raw bytes to decode.
        detect_as_utf8_when_possible: When true, report ``'utf-8'`` instead of
            the detected name if ``is_utf8``/``is_probably_utf8`` accept the
            detected encoding.
        check_text_for_utf8_compliance: When true (and the previous flag is
            set), the decision is delegated to ``is_utf8_text``, which also
            receives the text itself.

    Returns:
        A ``(decoded_text, encoding_name, bom_bytes)`` tuple. ``bom_bytes`` is
        the BOM found at the start of the input, or ``b''`` when the encoding
        was detected statistically. Empty input yields ``('', 'utf-8', b'')``.
    """
    if not text:
        return str(), 'utf-8', bytes()

    text = normalize_text(text, bytes)
    possible_utf_bom = determine_text_bom(text)
    text = remove_bom(text, possible_utf_bom)
    possible_encoding = determine_bom_encoding(possible_utf_bom)
    if possible_encoding is not None:
        # A BOM identifies the encoding, so no statistical detection is needed.
        return decode_text_and_remove_all_wrong_symbols(text, possible_encoding), possible_encoding, possible_utf_bom

    try_charset_normalizer = False
    try:
        if CHARDET_PRESENT:
            detection = chardet.detect(text)
        else:
            try_charset_normalizer = True
    except LookupError:
        try_charset_normalizer = True

    if try_charset_normalizer:
        detection = cn_detect(text)

    encoding = detection["encoding"]
    bom_bytes = bytes()
    if encoding is None:
        # The detector could not name an encoding. Calling ``.lower()`` or
        # ``bytes.decode`` with None would raise, so fall back to a lossy
        # UTF-8 decode instead of crashing.
        return text.decode('utf-8', errors='replace'), 'utf-8', bom_bytes

    if detect_as_utf8_when_possible:
        if check_text_for_utf8_compliance:
            result_encoding = 'utf-8' if is_utf8_text(encoding, text) else encoding
        else:
            result_encoding = 'utf-8' if (is_utf8(encoding) or is_probably_utf8(encoding)) else encoding
    else:
        result_encoding = encoding

    # The bytes are decoded with the *detected* encoding; ``result_encoding``
    # is only the name reported back to the caller.
    return text.decode(encoding), result_encoding, bom_bytes
def
is_utf8_text(encoding: str, text: Union[bytes, bytearray]) -> bool:
def
is_text_is_7bit_utf8_compatible(text: Union[bytes, bytearray]) -> bool:
def
is_probably_utf8(encoding: str) -> bool:
def
is_utf8(encoding: str) -> bool: