cengal.text_processing.encoding_detection.versions.v_0.encoding_detection

#!/usr/bin/env python
# coding=utf-8

# Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


__all__ = [
    'decode',
    'detect_and_decode',
    'is_utf8_text',
    'is_text_is_7bit_utf8_compatible',
    'is_probably_utf8',
    'is_utf8',
]


from typing import Tuple, Union
from cengal.modules_management.alternative_import import alt_import

# Optional dependency: cchardet is used when it is available; charset_normalizer
# serves as the fallback (see detect_and_decode below).
with alt_import('cchardet') as chardet:
    if chardet is None:
        CHARDET_PRESENT: bool = False
    else:
        CHARDET_PRESENT = True

from charset_normalizer import detect as cn_detect
from cengal.text_processing.text_processing import Text, normalize_text
from cengal.text_processing.utf_bom_processing import *


"""
Encoding detection and decoding helpers for byte strings: BOM-based detection,
cchardet / charset_normalizer fallback detection, and UTF-8 compatibility checks.
"""

__author__ = "ButenkoMS <gtalk@butenkoms.space>"
__copyright__ = "Copyright © 2012-2024 ButenkoMS. All rights reserved. Contacts: <gtalk@butenkoms.space>"
__credits__ = ["ButenkoMS <gtalk@butenkoms.space>", ]
__license__ = "Apache License, Version 2.0"
__version__ = "4.4.1"
__maintainer__ = "ButenkoMS <gtalk@butenkoms.space>"
__email__ = "gtalk@butenkoms.space"
# __status__ = "Prototype"
__status__ = "Development"
# __status__ = "Production"


def detect_and_decode(text: Union[bytes, bytearray], detect_as_utf8_when_possible: bool = True, check_text_for_utf8_compliance: bool = True) -> Tuple[str, str, bytes]:
    if not text:
        return str(), 'utf-8', bytes()

    text = normalize_text(text, bytes)
    # A UTF BOM, if present, identifies the encoding unambiguously.
    possible_utf_bom = determine_text_bom(text)
    text = remove_bom(text, possible_utf_bom)
    possible_encoding = determine_bom_encoding(possible_utf_bom)
    if possible_encoding is not None:
        return decode_text_and_remove_all_wrong_symbols(text, possible_encoding), possible_encoding, possible_utf_bom
    else:
        # No BOM: detect with cchardet when it is installed; otherwise (or if it
        # fails with LookupError) fall back to charset_normalizer.
        try_charset_normalizer = False
        try:
            if CHARDET_PRESENT:
                detection = chardet.detect(text)
            else:
                try_charset_normalizer = True
        except LookupError:
            try_charset_normalizer = True

        if try_charset_normalizer:
            detection = cn_detect(text)

        encoding = detection["encoding"]
        # Optionally report the result as UTF-8 when the detected encoding is
        # UTF-8-compatible (and, if requested, the bytes themselves are 7-bit clean).
        if detect_as_utf8_when_possible:
            if check_text_for_utf8_compliance:
                result_encoding = 'utf-8' if is_utf8_text(encoding, text) else encoding
            else:
                result_encoding = 'utf-8' if (is_utf8(encoding) or is_probably_utf8(encoding)) else encoding
        else:
            result_encoding = encoding

        bom_bytes = bytes()
        return text.decode(encoding), result_encoding, bom_bytes


utf8_compatible_encodings = {
    'utf-8',
    'ISO-8859-1',
    'Latin 1',
}
utf8_compatible_encodings_lower = {encoding.lower() for encoding in utf8_compatible_encodings}


utf8_half_compatible_encodings = {
    'US-ASCII',
    'ASCII',
    'ANSI_X3.4-1968',
    'iso-ir-6',
    'ANSI_X3.4-1986',
    'ISO_646.irv:1991',
    'ASCII-7',
    'ASCII-8',
    'ISO646-US',
    'us',
    'IBM367',
    'cp367',
    'csASCII',
}
utf8_half_compatible_encodings_lower = {encoding.lower() for encoding in utf8_half_compatible_encodings}


def is_utf8(encoding: str) -> bool:
    return encoding.lower() in utf8_compatible_encodings_lower


def is_probably_utf8(encoding: str) -> bool:
    return encoding.lower() in utf8_half_compatible_encodings_lower


def is_text_is_7bit_utf8_compatible(text: Union[bytes, bytearray]) -> bool:
    return all(b <= 127 for b in text)


def is_utf8_text(encoding: str, text: Union[bytes, bytearray]) -> bool:
    if is_utf8(encoding):
        return True
    elif is_probably_utf8(encoding):
        return is_text_is_7bit_utf8_compatible(text)
    else:
        return False


def decode(text: Union[bytes, bytearray]) -> str:
    text, encoding, bom_bytes = detect_and_decode(text)
    return text
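

Usage sketch (illustrative, not taken from the module itself): a minimal example of how the public helpers above might be called, assuming the cengal package and at least one of cchardet / charset_normalizer are installed; the sample byte strings are arbitrary.

    from cengal.text_processing.encoding_detection.versions.v_0.encoding_detection import (
        decode, detect_and_decode, is_utf8, is_probably_utf8,
        is_text_is_7bit_utf8_compatible, is_utf8_text,
    )

    raw = 'Привет, мир!'.encode('utf-8')

    # Full result: decoded str, detected (possibly UTF-8-promoted) encoding name,
    # and the BOM bytes that were stripped (empty when no BOM was present).
    decoded, encoding, bom_bytes = detect_and_decode(raw)

    # Convenience wrapper that returns only the decoded str.
    just_text = decode(raw)

    # Classification helpers operate on encoding names and raw bytes directly.
    assert is_utf8('UTF-8')
    assert is_probably_utf8('ascii')
    assert is_text_is_7bit_utf8_compatible(b'plain ascii')
    assert is_utf8_text('ascii', b'plain ascii')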