import errno
import os
import re

from unidecode import unidecode

import egaia_log
import egaia_list
def getSanitizedName(filepath):
    """Return ``filepath`` with its last path component sanitized.

    The directory portion is left untouched. The file name is cut at every
    ``.``, each piece is run through ``makeSlug`` and the pieces are glued
    back together with ``.``, so the dots in the name keep their positions.
    Nothing is touched on disk; only the new path string is computed.
    """
    head, tail = os.path.split(filepath)
    # FIXME: This won't actually work on non-utf8 systems
    slugged = [makeSlug(piece) for piece in tail.split('.')]
    return os.path.join(head, '.'.join(slugged))
def sanitize(filepath):
    """Rename ``filepath`` on disk to its sanitized name.

    The new name is computed by ``getSanitizedName``. When the name is
    already clean, nothing is renamed, printed or logged. Otherwise the
    rename is announced on stdout, performed with ``os.rename`` and recorded
    through ``egaia_log.logRename``.

    Returns the sanitized path (equal to ``filepath`` when nothing had to be
    done).

    Raises ``OSError`` with ``errno.EEXIST`` when something else already
    exists at the sanitized path. On POSIX ``os.rename`` would silently
    replace that file, so two different names that sanitize to the same
    string would destroy one of them; on Windows ``os.rename`` already
    raises ``OSError`` in this situation.
    """
    sanitized = getSanitizedName(filepath)
    if sanitized == filepath:
        # Already clean: renaming a file onto itself only adds log noise.
        return sanitized
    # lexists() also reports dangling symlinks, which rename would replace too.
    if os.path.lexists(sanitized):
        raise OSError(errno.EEXIST, os.strerror(errno.EEXIST), sanitized)
    print("Rename %s as %s..." % (filepath, sanitized))
    os.rename(filepath, sanitized)
    egaia_log.logRename(filepath, sanitized)
    return sanitized
[docs]def makeSlug(tag):
"""Create a latin-encoded keyword tag. This allows us to use a standard and
predictable system for two-way mapping of URL/filenames and keywords in
various character sets.
"""
# transliterate to ascii characters
t = unidecode(unicode(tag))
# conflate multiple whitespaces
# use underscores for word boundaries
t = re.sub(ur'\s+', u'_', t.strip())
t = re.sub(ur'[^a-zA-Z0-9_\-\.]', u'', t)
return t
def sanitizeFiles():
    """Run ``sanitize`` on every path returned by ``egaia_list.listFiles()``.

    When that listing is empty (or otherwise falsy) a short notice is
    printed and nothing else happens. Always returns ``None``.
    """
    collection = egaia_list.listFiles()
    if collection:
        for entry in collection:
            sanitize(entry)
    else:
        print("No files to sanitize!")
def _cli(args):
    """egaia sanitize
    Sanitize filenames to remove non-ascii characters.
    Usage:
      egaia sanitize --help
      egaia sanitize [ --dry-run ] ( --all | FILENAME... )
    """
    # NOTE(review): the docstring above reads like a docopt usage text that
    # is presumably parsed at runtime -- confirm before rewording it.
    #
    # ``args`` is a mapping that must provide the keys 'FILENAME' (a list of
    # paths, possibly empty) and '--dry-run' (a flag).
    if not args['FILENAME']:
        # No explicit names: fall back to the whole collection. The result is
        # written back into ``args`` exactly as before, so a caller that
        # inspects the mapping afterwards still sees the expanded list.
        args['FILENAME'] = egaia_list.listFiles()
    filenames = args['FILENAME']
    if not filenames:
        # Same guard and message as sanitizeFiles(). It also avoids iterating
        # over None should listFiles() return nothing.
        print("No files to sanitize!")
        return
    dry_run = args['--dry-run']
    for filename in filenames:
        if dry_run:
            # Only show what would happen; nothing is renamed.
            print("Original: %s" % filename)
            print("Modified: %s" % getSanitizedName(filename))
        else:
            sanitize(filename)