Skip to content

Commit

Permalink
Fix #424: is binary (#425)
Browse files Browse the repository at this point in the history
* Fix #424: basics

* Fix #424: ckp

* Fix #424: test start

* Fix #424: doc start

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: CKP

* Fix #424: decouple parse from IO

* Fix #424: ckp

* Fix #424: wip

* Fix #424: ckp

* Fix #424: checks 3 bytes further

* Fix #424: cleanup

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: test ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: doc

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

* Fix #424: ckp

---------

Co-authored-by: git-user <git-email>
  • Loading branch information
gurhar1133 authored Jan 31, 2024
1 parent f5da92a commit e39fce3
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 5 deletions.
20 changes: 18 additions & 2 deletions pykern/pkio.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-
"""Useful I/O operations
:copyright: Copyright (c) 2015 RadiaSoft LLC. All Rights Reserved.
:license: http://www.apache.org/licenses/LICENSE-2.0.html
"""
from __future__ import absolute_import, division, print_function

# Root module: Limit imports to avoid dependency issues
from pykern import pkconst
Expand All @@ -17,6 +15,7 @@
import os
import os.path
import py
import pykern.util
import random
import re
import shutil
Expand Down Expand Up @@ -128,6 +127,23 @@ def has_file_extension(filename, to_check):
return e in to_check


def is_pure_text(filepath, test_size=512):
    """Guess whether the file at ``filepath`` holds text by sampling its start.

    One byte more than ``test_size`` is read so that a file longer than
    the sample can be recognized as truncated. Only the first
    ``test_size`` bytes are handed to `pykern.util.is_pure_text`, which
    implements the heuristics.

    Args:
        filepath (str|py.path): file to check
        test_size (int): number of leading bytes of filepath to test
    Returns:
        bool: True if file is likely pure text, False if likely binary
    """
    with open(filepath, "rb") as f:
        sample = f.read(test_size + 1)
    # Getting the extra byte back means the file continues past the sample.
    truncated = len(sample) > test_size
    return pykern.util.is_pure_text(sample[:test_size], is_truncated=truncated)


def mkdir_parent(path):
"""Create the directories and their parents (if necessary)
Expand Down
59 changes: 58 additions & 1 deletion pykern/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
"""Support routines, including run dir resolution.
:copyright: Copyright (c) 2023 RadiaSoft LLC. All Rights Reserved.
Expand All @@ -12,8 +11,10 @@
import sys


# Exclusive upper bound on the fraction of atypical control codes that
# `is_pure_text` tolerates in decoded text before calling it binary.
_ACCEPTABLE_CONTROL_CODE_RATIO = 0.33
# Directory name joined onto `pkio.py_path()` and created with
# `pkio.mkdir_parent`.
_DEFAULT_ROOT = "run"
# NOTE(review): presumably files whose presence marks a development
# checkout; the code that reads this is not visible here -- confirm.
_DEV_ONLY_FILES = ("setup.py", "pyproject.toml")
# Control codes below 0x20 that `is_pure_text` does not count against the
# ratio: BEL, BS, TAB, LF, VT, FF, CR, ESC.
_VALID_ASCII_CONTROL_CODES = frozenset((0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x1B))


def cfg_absolute_dir(value):
Expand Down Expand Up @@ -65,3 +66,59 @@ def _check_dev_files(root):
# Don't run from an install directory
r = pkio.py_path()
return pkio.mkdir_parent(r.join(_DEFAULT_ROOT))


def is_pure_text(value, is_truncated=False):
    """Guess whether ``value`` is text data using heuristics.

    The checks, in order:

    * Empty ``value`` is considered text.
    * ``value`` must decode as strict utf-8. If ``is_truncated``, an
      incomplete multi-byte character at the very end of ``value`` is
      tolerated and ignored, because the cut may have landed inside a
      valid character (at most 3 bytes of a 4 byte character). Bytes
      that can never be part of valid utf-8 still mean binary, wherever
      they occur.
    * If no complete character can be decoded, the data is binary.
    * A null character means binary.
    * The fraction of control codes below 0x20 that are not in
      `_VALID_ASCII_CONTROL_CODES` must be less than
      `_ACCEPTABLE_CONTROL_CODE_RATIO`.

    Args:
        value (bytes): bytes data
        is_truncated (bool): whether or not value has been truncated
    Returns:
        bool: True if value is likely pure text, False if likely binary
    """
    import codecs

    def _is_accepted_control_code_ratio(text_value):
        if "\0" in text_value:
            return False
        n = sum(
            1
            for o in map(ord, text_value)
            if o < 32 and o not in _VALID_ASCII_CONTROL_CODES
        )
        return (n / len(text_value)) < _ACCEPTABLE_CONTROL_CODE_RATIO

    def _utf8_decoded():
        try:
            if is_truncated:
                # final=False holds back an incomplete trailing sequence
                # instead of raising, but still raises on invalid bytes.
                # Blindly stripping trailing bytes would accept data such
                # as b"abc\xff" or b"abc\xc2\x00".
                return codecs.getincrementaldecoder("utf-8")("strict").decode(
                    value, final=False
                )
            return value.decode("utf-8", "strict")
        except UnicodeDecodeError:
            return None

    if value == b"":
        return True
    d = _utf8_decoded()
    if not d:
        # None: not valid utf-8. "": value held only a fragment of a
        # single character, so there is nothing to judge.
        return False
    return _is_accepted_control_code_ratio(d)
Binary file added tests/pkio_data/binary.dat
Binary file not shown.
1 change: 1 addition & 0 deletions tests/pkio_data/text.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Example text data
11 changes: 9 additions & 2 deletions tests/pkio_test.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-
"""PyTest for :mod:`pykern.pkio`
:copyright: Copyright (c) 2015 Bivio Software, Inc. All Rights Reserved.
:license: http://www.apache.org/licenses/LICENSE-2.0.html
"""
from __future__ import absolute_import, division, print_function
import glob
import os
import py
Expand Down Expand Up @@ -59,6 +57,15 @@ def test_has_file_extension():
pkeq(False, pkio.has_file_extension("filename_with_no_extension", "json"))


def test_is_pure_text():
    """Check `pkio.is_pure_text` against the binary and text fixture files."""
    from pykern import pkunit
    from pykern import pkio

    data = pkunit.data_dir()
    for expect, basename in ((False, "binary.dat"), (True, "text.dat")):
        pkunit.pkeq(expect, pkio.is_pure_text(data.join(basename)))


def test_py_path():
from pykern import pkunit
from pykern import pkio
Expand Down
33 changes: 33 additions & 0 deletions tests/util_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""PyTest for :mod:`pykern.util`
:copyright: Copyright (c) 2022 Bivio Software, Inc. All Rights Reserved.
:license: http://www.apache.org/licenses/LICENSE-2.0.html
"""


def test_is_pure_text():
    """Run `util.is_pure_text` over a table of binary and text samples."""
    from pykern import util
    from pykern import pkunit

    a = "a".encode("utf-8")
    # Each case is (expected result, value, is_truncated), checked in order.
    cases = (
        (False, b"\0 one null causes failure in valid text", False),
        (False, a + b"\xc2", False),
        (False, bytes(range(1, 0x20)), False),
        (False, b"\xd4\x16\xc0\xd6\xec\xbf\x92\xe6\x84T\xc9 \xe9\xbf", False),
        # backwards probing on non-text case
        (False, a + b"\xc2\xc2\xc2\xc2", True),
        # boundary of control code ratio
        (False, b"\x01" * 33 + b"\x07" * 67, False),
        (True, b"\x01" * 32 + b"\x07" * 68, False),
        (True, b"", False),
        (True, b"This is example text", False),
        (True, b"\x07\x08\t\n\x0b\x0c\r\x0e\x0f", False),
        # backwards probing on text case
        (True, a + "¡".encode("utf-8"), True),
        (True, a + b"\xf0\x9f\x8c\xae", True),
    )
    for expect, value, truncated in cases:
        pkunit.pkeq(expect, util.is_pure_text(value, is_truncated=truncated))

0 comments on commit e39fce3

Please sign in to comment.