import hashlib
import re
import unicodedata
from pathlib import Path
from typing import Union
[docs]
re_slugify = re.compile(r"[^\w\s-]", re.UNICODE)
[docs]
SUBSTITUTION_RE = re.compile(r"[^\w\-\.]")
[docs]
MULTI_RE = re.compile(r"_{2,}")
[docs]
def clean_datapackage_name(name: str) -> str:
"""Clean string ``name`` of characters not allowed in data package names.
Replaces with underscores, and drops multiple underscores."""
return re.sub(MULTI_RE, "_", re.sub(SUBSTITUTION_RE, "_", name).strip("_")).strip()
[docs]
def safe_filename(string: Union[str, bytes], add_hash: bool = True, full: bool = False) -> str:
"""Convert arbitrary strings to make them safe for filenames. Substitutes strange characters, and uses unicode normalization.
if `add_hash`, appends hash of `string` to avoid name collisions.
From http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python
"""
safe = re.sub(
r"[-\s]+",
"-",
str(re_slugify.sub("", unicodedata.normalize("NFKD", str(string))).strip()),
)
if add_hash:
if isinstance(string, str):
string = string.encode("utf8")
if full:
safe += "." + hashlib.md5(string).hexdigest()
else:
safe += "." + hashlib.md5(string).hexdigest()[:8]
return safe
[docs]
def md5(filepath: Union[str, Path], blocksize: int = 65536) -> str:
"""Generate MD5 hash for file at `filepath`"""
hasher = hashlib.md5()
fo = open(filepath, "rb")
buf = fo.read(blocksize)
while len(buf) > 0:
hasher.update(buf)
buf = fo.read(blocksize)
return hasher.hexdigest()