Source code for egaia.egaia_meta

import os
import subprocess
import magic, mimetypes
import egaia_parsefn
import egaia_config
import utils
import json


### AUTOMATIC METADATA EXTRACTION FUNCTIONS

def getDate(filename):
    """Return the file's modification date, for dc_date.

    The ``st_mtime`` value reported by ``os.stat`` is handed to
    ``utils.isotime`` and whatever that helper returns is passed back.
    The modification time is not necessarily reliable, but it is
    sometimes useful.
    """
    mtime = os.stat(filename).st_mtime
    return utils.isotime(mtime)
def getSize(filename):
    """Return the size of the file, for dc_extent.

    The byte count from ``os.path.getsize`` is passed through
    ``utils.byteSize``; the result of that helper is returned as-is
    (presumably a formatted size -- confirm against ``utils``).
    """
    size_in_bytes = os.path.getsize(filename)
    return utils.byteSize(size_in_bytes)
def getDuration(filename):
    """Retrieve the duration of audiovisual media, for dc_extent.

    Runs the ffprobe command configured under ``system/cmd_ffprobe`` with
    ``-show_format ... -of ini`` and scans its standard output.

    Returns the text following ``=`` on the first output line that starts
    with ``duration``, or ``None`` when no such line is present.
    """
    cmd_ffprobe = egaia_config.getConfig('system', 'cmd_ffprobe')
    p = subprocess.Popen(
        [cmd_ffprobe, '-show_format', filename, '-of', 'ini'],
        stdout=subprocess.PIPE)
    stdout = p.communicate()[0]
    # On Python 3 the pipe yields bytes, and bytes.startswith('duration')
    # raises TypeError. Decode only when the output is not already a native
    # str, so Python 2 (where the pipe output *is* str) behaves as before.
    if not isinstance(stdout, str):
        stdout = stdout.decode('utf-8', 'replace')
    for line in stdout.splitlines():
        if line.startswith('duration'):
            return line.partition('=')[2]
    return None
def getDimensions(filename):
    """Retrieve the dimensions of images, for dc_extent.

    Runs the identify command configured under ``system/cmd_identify`` with
    the format string ``%[fx:w]x%[fx:h]`` on ``filename[0]`` (the ``[0]``
    suffix presumably selects the first frame/page -- confirm against the
    identify documentation).

    Returns the command's output as a native string, or ``None`` when the
    command cannot be run or fails. This is a best-effort lookup: errors are
    deliberately not propagated.
    """
    cmd_identify = egaia_config.getConfig('system', 'cmd_identify')
    try:
        # FIXME: This fails on relative pathnames.
        dimensions = subprocess.check_output(
            [cmd_identify, '-format', '%[fx:w]x%[fx:h]', '%s[0]' % filename])
    except Exception:
        # Still best-effort, but unlike a bare ``except:`` this no longer
        # swallows KeyboardInterrupt / SystemExit.
        return None
    # check_output returns bytes on Python 3; callers interpolate the value
    # with '%s px', which would otherwise render as "b'..' px". On Python 2
    # the output is already str and is returned untouched.
    if not isinstance(dimensions, str):
        dimensions = dimensions.decode('utf-8', 'replace')
    return dimensions
def getMtype(filename):
    """Return the MIME type of ``filename``, for dc_format.

    Directories are reported as ``inode/directory`` without consulting
    libmagic; everything else is identified by ``magic`` in MIME mode.
    """
    if not os.path.isdir(filename):
        detector = magic.Magic(mime=True)
        return detector.from_file(filename)
    # https://specifications.freedesktop.org/shared-mime-info-spec/
    return 'inode/directory'
def getType(filename):
    """Identify the DCMI Type of ``filename``, for dc_type.

    Directories are 'Collection', except those whose name ends in 'vclips',
    which are 'Moving Image'. Names ending in 'docx' are 'Text' without
    consulting libmagic. Anything else is classified from its detected MIME
    type; ``None`` is returned when the MIME type is not recognised.
    """
    if os.path.isdir(filename):
        return 'Moving Image' if filename.endswith('vclips') else 'Collection'

    if filename.endswith('docx'):
        # python-magic fails on some of our files...
        return 'Text'

    detected = magic.Magic(mime=True).from_file(filename)

    # Top-level MIME families that map directly onto a DCMI type.
    family_to_dcmi = (
        ('video', 'Moving Image'),
        ('text', 'Text'),
        ('audio', 'Sound'),
        ('image', 'Image'),
    )
    for family, dcmi_label in family_to_dcmi:
        if detected.startswith(family):
            return dcmi_label

    # Document formats that live under application/* but are textual.
    textual_documents = (
        'application/pdf',
        'application/postscript',
        'application/vnd.oasis.opendocument.text',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/msword',
    )
    if detected in textual_documents:
        return 'Text'
    return None
### LOCAL FUNCTIONS
def getMetadata(filename, existing=None, restrict=False):
    """Extract all the metadata for a given file.

    Pass the "existing" parameter in order to merge new metadata with an
    existing record. The existing data will not be overwritten: its entries
    take precedence over the freshly extracted ones.

    The "restrict" parameter limits the extracted record to
    DCTERMS.modified, DCTERMS.extent, DCTERMS.type and DCTERMS.format, which
    are always derived from the file on disk. Without it the record also
    carries DCTERMS.title, DCTERMS.identifier and original_filename.

    Returns a dictionary keyed by the term names configured in the 'terms'
    section of the egaia configuration.
    """
    fileparts = egaia_parsefn.parseFilename(filename)
    # Last path component of the first element returned by parseFilename().
    basename = fileparts[0].rpartition(os.sep)[2]
    item_type = getType(filename)
    item_identifier = fileparts[1]
    item_date = getDate(filename)
    item_format = getMtype(filename)

    extent_parts = []
    if os.path.isfile(filename):
        size = getSize(filename)
        if size:
            extent_parts.append(size)
    if item_type in ('Sound', 'Moving Image'):
        duration = getDuration(filename)
        if duration:
            extent_parts.append(duration)
    # Compare with ==: the previous "in ('Image')" was a substring test
    # against a plain string, not membership in a tuple.
    if item_type == 'Image' and not filename.endswith('.svg'):
        # Ignore svg files because they cause memory black holes
        # FIXME: use a command, other than identify, that works with
        # "Moving Image"
        dimensions = getDimensions(filename)
        if dimensions:
            extent_parts.append('%s px' % dimensions)

    item_filename = '%s.%s' % (basename, fileparts[2])

    def term(name):
        # Resolve a term to its configured key name.
        return egaia_config.getConfig('terms', name)

    # Disable DCTERMS.title and DCTERMS.date because this overwrites existing
    # data when we tag files
    if restrict:
        info = {
            term('DCTERMS.modified'): [item_date],
            term('DCTERMS.extent'): extent_parts,
            term('DCTERMS.type'): [item_type],
            term('DCTERMS.format'): [item_format],
        }
    else:
        info = {
            # Note: the title is a plain string, unlike the list-valued
            # entries below.
            term('DCTERMS.title'):
                item_filename.replace('_', ' ').rpartition('.')[0],
            term('DCTERMS.modified'): [item_date],
            term('DCTERMS.format'): [item_format],
            term('DCTERMS.type'): [item_type],
            term('DCTERMS.identifier'): [item_identifier],
            term('DCTERMS.extent'): extent_parts,
            term('original_filename'): [item_filename],
        }
    if existing:
        # Existing values win over the freshly extracted ones.
        info.update(existing)
    return info
def _cli(args):
    """egaia meta

    Automatically extract metadata from items in the collection, and return as
    a dictionary. This utility retrieves the file size, duration (for
    audiovisual media), dimensions (for images), and general type for a
    specified item.

    Usage:
        egaia meta --help
        egaia meta --extract=FILENAME
    """
    # ``args`` is a mapping of option names to values; nothing happens
    # unless '--extract' carries a truthy filename.
    if args['--extract']:
        # Parenthesised single-argument print behaves identically on
        # Python 2, and avoids the print-statement SyntaxError that would
        # stop this module from being imported on Python 3.
        print(json.dumps(getMetadata(args['--extract'])))