# Source code for egaia.egaia_sanitize

import os
import re
from unidecode import unidecode

import egaia_log
import egaia_list

def getSanitizedName(filepath):
    """Create a sanitized filename

    The directory part of *filepath* is left untouched.  The final path
    component is split on '.', every piece is passed through makeSlug(),
    and the pieces are joined with '.' again before being re-attached to
    the directory.
    """
    dirpath, filename = os.path.split(filepath)
    # FIXME: This won't actually work on non-utf8 systems
    slugged = '.'.join(makeSlug(piece) for piece in filename.split('.'))
    return os.path.join(dirpath, slugged)
def sanitize(filepath):
    """Sanitize a filename

    Computes the sanitized name for *filepath*, announces the rename on
    stdout, renames the file on disk, records the rename through
    egaia_log.logRename(), and returns the new path.
    """
    target = getSanitizedName(filepath)
    # A single parenthesized operand prints exactly the same text under the
    # Python 2 print statement.
    print("Rename %s as %s..." % (filepath, target))
    os.rename(filepath, target)
    egaia_log.logRename(filepath, target)
    return target
[docs]def makeSlug(tag): """Create a latin-encoded keyword tag. This allows us to use a standard and predictable system for two-way mapping of URL/filenames and keywords in various character sets. """ # transliterate to ascii characters t = unidecode(unicode(tag)) # conflate multiple whitespaces # use underscores for word boundaries t = re.sub(ur'\s+', u'_', t.strip()) t = re.sub(ur'[^a-zA-Z0-9_\-\.]', u'', t) return t
def sanitizeFiles():
    """Sanitize ALL the files in a collection

    Fetches the file list from egaia_list.listFiles() and runs sanitize()
    on each entry; prints a notice instead when the list is empty.
    """
    filepaths = egaia_list.listFiles()
    if filepaths:
        for path in filepaths:
            sanitize(path)
    else:
        print("No files to sanitize!")
def _cli(args):
    """egaia sanitize

    Sanitize filenames to remove non-ascii characters.

    Usage:
        egaia sanitize --help
        egaia sanitize [ --dry-run ] ( --all | FILENAME... )
    """
    # NOTE(review): the docstring above looks like docopt usage text, so its
    # wording is kept verbatim -- confirm against the command dispatcher.
    #
    # `args` is indexed by 'FILENAME' and '--dry-run'.  When no filenames
    # were given, args['FILENAME'] is overwritten in place with the full
    # listing from egaia_list.listFiles().
    if not args['FILENAME']:
        # `or []` tolerates a falsy listing (sanitizeFiles() guards against
        # the same case) so the loop below is simply skipped instead of
        # failing on a non-iterable value.
        args['FILENAME'] = egaia_list.listFiles() or []

    for filename in args['FILENAME']:
        if args['--dry-run']:
            # Dry run: show the before/after names without touching disk.
            print("Original: %s" % filename)
            print("Modified: %s" % getSanitizedName(filename))
        else:
            sanitize(filename)