# -*- coding: utf-8 -*-
# Requirements:
# libavcodec-extra, inkscape, libreoffice, weasyprint
# libjpeg62
# wget
# wkhtmltopdf
import sys, os, subprocess, re
import shutil
from docopt import docopt
import pkg_resources
import codecs
import base64
from egaia_config import getConfig
import egaia_parsefn
import egaia_list
import egaia_meta
import utils
import egaia_root
import strings
# Do not create derivatives on low-spec systems, such as an appliance built
# on Raspberry Pi. The 'no_deriv' option of the 'system' section needs to be
# set manually in the configuration file. If derivatives creation is disabled
# here, we should synchronize the archive with another system (e.g., with
# rsync or btsync) and run the command there.
if getConfig('system', 'no_deriv', boolean=True) is True:
    # sys.exit() rather than the bare exit() helper: the latter is injected
    # by the `site` module for interactive use and is not guaranteed to exist.
    sys.exit('This command is disabled on the current system.')
# External command-line tools used by the conversion rules below. Each value
# is read from the 'system' section of the configuration and is later passed
# as the first element of the argument list handed to run().
# NOTE(review): presumably these are executable names or paths -- confirm
# against the configuration file.
# FIXME: move the getConfig() functions out of the main namespace
cmd_convert = getConfig('system', 'cmd_convert')
cmd_mogrify = getConfig('system', 'cmd_mogrify')
cmd_inkscape = getConfig('system', 'cmd_inkscape')
cmd_ffmpeg = getConfig('system', 'cmd_ffmpeg')
cmd_libreoffice = getConfig('system', 'cmd_libreoffice')
cmd_wget = getConfig('system', 'cmd_wget')
cmd_wkhtmltopdf = getConfig('system', 'cmd_wkhtmltopdf')
cmd_wkhtmltoimage = getConfig('system', 'cmd_wkhtmltoimage')
# mapping of formats and known extensions
# Keys are the media types used to select processing rules in
# `outputFormats`; values are tuples of lower-case extensions without the
# leading dot.
knownFormats = {
    'audio': ('mp3', 'wav', 'wma', 'ogg'),
    'doc': ('odt', 'odp', 'doc', 'docx', 'ppt', 'pptx'),
    'raster': ('bmp', 'gif', 'jpg', 'jpeg', 'png', 'psd', 'tiff', 'tif'),
    'vector': ('eps', 'svg'),
    # 'mwv' looks like a transposition of 'wmv'; the correct spelling is
    # added and the old one kept so existing matches do not change.
    'video': ('avi', 'flv', 'mov', 'mpeg', 'mwv', 'wmv', 'mp4', 'webm',
              'ogv', 'mkv'),
    'pdf': ('pdf',),
    'text': ('txt', 'rst', 'md'),
    'url': ('url',),
    'vclips': ('vclips',),
}
# mapping of formats and processing rules
# https://www.archivematica.org/wiki/Format_policies
# Each media type maps to an ordered list of (rule, output suffix) pairs.
# Order matters: some rules read the output of an earlier rule in the list
# (e.g. thumbnails made from the generated pdf or h264 file).
# we use "med" and "thumb" as derived image sizes to make these accessible
# to html writers, even if we update the size settings later
# the video format is hard-coded because we can offer a set of options in
# the video tag

# Rules shared by single video files and by directories of video clips.
_VIDEO_RULES = [
    ('pf-ffv1', 'mkv'),
    ('df-h264', 'mp4'),
    ('df-360p-vp9-400k', 'webm'),
    ('df-stills', 'dir'),
    ('df-contact-sheet', 'html'),
    ('df-thumb-vid', 'jpg'),
    ('df-med-img-vid', 'jpg'),
]

outputFormats = {
    'audio': [
        ('pf-wav', 'wav'),
        ('df-mp3', 'mp3'),
    ],
    'doc': [
        ('df-pdf', 'pdf'),
        ('df-html', 'html'),
        ('df-med-img', 'jpg'),
        ('df-thumb-img', 'jpg'),
    ],
    'raster': [
        ('pf-tiff', 'tiff'),
        ('df-med-img', 'jpg'),
        ('df-thumb-img', 'jpg'),
    ],
    'vector': [
        ('pf-vector', 'svg'),
        ('df-pdf', 'pdf'),
        ('df-med-img', 'png'),
        ('df-thumb-img', 'png'),
    ],
    'video': list(_VIDEO_RULES),
    'pdf': [
        ('df-pdf', 'pdf'),
        ('df-thumb-img', 'jpg'),
        ('df-med-img', 'jpg'),
    ],
    'text': [
        ('df-txt', 'txt'),
        ('df-html', 'html'),
    ],
    'url': [
        ('df-webarc', 'dir'),
        ('df-pdf', 'pdf'),
        ('df-screenshot', 'png'),
        ('df-med-img', 'png'),
        ('df-thumb-img', 'png'),
    ],
    # clip directories first need the ffconcat list the video rules read
    'vclips': [('df-concat-list', 'txt')] + _VIDEO_RULES,
}
def run(arguments):
    """Run an external command by delegating to ``utils.run``.

    `arguments` is handed over unchanged and the result of ``utils.run`` is
    returned as-is.  The "dry-run" behaviour (doing nothing when that flag is
    set) is presumably implemented inside ``utils.run``; it is not visible
    here.
    """
    outcome = utils.run(arguments)
    return outcome
# Extensions (lower-case, with leading dot) accepted as clips inside a
# "vclips" directory.
_CLIP_EXTENSIONS = ('.avi', '.mov', '.mpeg', '.mp4', '.webm', '.ogv', '.mkv')


def _listClips(dirname):
    """Return the video files found below `dirname`, in mtime order."""
    filenames = list()
    for root, dirs, files in os.walk(dirname):
        for f in files:
            filenames.append(os.path.join(root, f))
    filenames.sort(key=os.path.getmtime)
    return [f for f in filenames
            if os.path.splitext(f)[1].lower() in _CLIP_EXTENSIONS]


def _readUrls(fn):
    """Return the lines of the url list file `fn`."""
    with open(fn, 'r') as urlList:
        return urlList.read().splitlines()


def _ffmpegInput(mtype, fn, outDir, basename, UUID):
    """Return the (ffmpeg argument prefix, input path) pair for a video rule."""
    cmd_args = [cmd_ffmpeg]
    if mtype == 'vclips':
        # use the clips concat list as input
        fn = os.path.join(outDir,
                          '%s.df-concat-list.%s.txt' % (basename, UUID))
        cmd_args = cmd_args + ['-f', 'concat', '-segment_time_metadata', '1']
    return cmd_args, fn


def makeDeriv(rule, mtype, fn, UUID, tmpFile, outDir, outFile, basename, N):
    """Create a derivative. This function contains the processing rules and
    commands.

    Parameters:
      rule     -- name of the processing rule (e.g. 'df-pdf', 'pf-tiff')
      mtype    -- media type of the source, a key of `knownFormats`
      fn       -- path of the source file (a directory for 'vclips')
      UUID     -- uuid tag of the item, used to build related file names
      tmpFile  -- temporary output path; renamed to `outFile` on success
      outDir   -- directory that holds this item's derivatives
      outFile  -- final output path (a directory for 'df-stills')
      basename -- name stem used to locate previously generated derivatives
      N        -- zero-based index: the frame/page passed to the image
                  converter, or the line of the url list to fetch

    Always returns None.  Nothing is renamed and no "Converted" message is
    printed when the rule is unknown for this media type, when the command
    reports failure (``False``), or when no output could be produced.
    """
    print("processing %s" % fn)
    print("rule %s" % rule)
    # Outcome of the rule. Starts as "failed" so that a branch which produces
    # nothing (e.g. an empty html conversion) is never reported as converted.
    r = False
    if rule == 'df-txt' and mtype in ('text',):
        r = shutil.copyfile(fn, tmpFile)
    elif rule == 'df-html' and mtype == 'text':
        with codecs.open(fn, mode='r', encoding='utf-8') as input_file:
            text = input_file.read()
        html = utils.md2html(text)
        if html:
            with codecs.open(tmpFile, mode='w',
                             encoding='utf-8') as html_file:
                html_file.write(html)
            r = True
    elif rule == 'df-html' and mtype == 'doc' and fn.endswith('.docx'):
        html = utils.docx2html(fn)
        if html:
            with codecs.open(tmpFile, mode='w',
                             encoding='utf-8') as html_file:
                html_file.write(html)
            r = True
    elif rule == 'df-pdf' and mtype == 'pdf':
        r = shutil.copyfile(fn, tmpFile)
    elif rule == 'df-webarc':
        r = run([cmd_wget, '--input-file=%s' % fn, '--convert-links',
                 '--page-requisites', '--span-hosts', '--adjust-extension',
                 '--restrict-file-names=windows',
                 '--directory-prefix=%s' % tmpFile])
    elif rule in ('pf-wav', 'df-mp3'):
        # ffmpeg picks the codec from the extension of the output file
        r = run([cmd_ffmpeg, '-i', fn, tmpFile])
    elif rule == 'df-pdf' and mtype == 'doc':
        print("converting with libreoffice...")
        r = run([cmd_libreoffice, '--headless', '--convert-to',
                 'pdf', '--outdir', outDir, fn])
        # Rename, since libreoffice doesn't let us specify a name
        pdfFile = '%s.%s.pdf' % (basename, UUID)
        try:
            os.rename(os.path.join(outDir, pdfFile), tmpFile)
        except OSError:
            # this will fail if LibreOffice is already open
            print("Error converting to PDF. Is LibreOffice already open?")
            r = False
    elif rule == 'df-pdf' and mtype == 'url':
        r = run([cmd_wkhtmltopdf, _readUrls(fn)[N], tmpFile])
    elif rule == 'df-screenshot' and mtype == 'url':
        # FIXME: Don't keep downloading the resource;
        # set a persistent cache file
        r = run([cmd_wkhtmltoimage, _readUrls(fn)[N], tmpFile])
    elif rule == 'df-med-img' and mtype == 'url':
        r = run([cmd_wkhtmltoimage, '--crop-h', '800', '--quality', '60',
                 _readUrls(fn)[N], tmpFile])
    elif rule == 'df-med-img':
        if mtype in ('vector', 'doc'):
            # use the generated pdf, not the original document
            fn = os.path.join(outDir, '%s.df-pdf.%s.pdf' % (basename, UUID))
        print("creating thumbnail from %s..." % fn)
        r = run([cmd_convert, '-density', '300', '%s[%s]' % (fn, N),
                 '-resize', '800x600>',
                 '-background', 'white', '-alpha', 'remove', '-auto-orient',
                 tmpFile])
    elif rule == 'df-thumb-img':
        if mtype in ('vector', 'doc'):
            # use the generated pdf, not the original document
            fn = os.path.join(outDir, '%s.df-pdf.%s.pdf' % (basename, UUID))
        elif mtype in ('url',):
            # use the medium image; it lives in outDir like every other
            # derivative of this item
            fn = os.path.join(outDir,
                              '%s.df-med-img.%s.png' % (basename, UUID))
        r = run([cmd_convert, '%s[%s]' % (fn, N), '-resize', '320x320>',
                 '-background', 'white', '-alpha', 'remove', '-auto-orient',
                 tmpFile])
    elif rule == 'pf-tiff':
        r = run([cmd_convert, '-compress', 'none', '%s[%s]' % (fn, N),
                 tmpFile])
    elif rule == 'pf-vector':
        r = run([cmd_inkscape, fn, '--export-plain-svg=%s' % tmpFile])
    elif rule == 'df-pdf' and mtype == 'vector':
        r = run([cmd_inkscape, fn, '--export-pdf=%s' % tmpFile])
    ## VIDEO RULES
    elif rule == 'df-concat-list':
        if not os.path.isdir(fn):
            print("%s is not a directory" % fn)
            return
        offsets = list()
        offset = 0
        clips = list()
        for filename in _listClips(fn):
            try:
                duration = egaia_meta.getDuration(filename)
            except Exception:
                print("Error probing %s; skipping!" % filename)
                continue
            clip_path = os.path.relpath(filename, os.path.dirname(fn))
            clips.append(u'file %s' % clip_path)
            clips.append(u'duration %s\n' % duration)
            # start time of this clip within the concatenated video
            offsets.append(u','.join([clip_path, str(offset)]))
            offset = offset + float(duration)
        if not clips:
            print("No video files to add!")
            return
        # write the ffconcat script
        with codecs.open(tmpFile, mode='w', encoding='utf-8') as out:
            out.write('ffconcat version 1.0\n\n' + '\n'.join(clips))
        # write the offsets database
        db = os.path.join(outDir,
                          '%s.df-concat-offsets.%s.csv' % (basename, UUID))
        with codecs.open(db, mode='w', encoding='utf-8') as out:
            out.write('\n'.join(offsets))
        r = True
    elif rule == 'df-h264':
        cmd_args, fn = _ffmpegInput(mtype, fn, outDir, basename, UUID)
        r = run(cmd_args + [
            '-i', fn,
            '-vcodec', 'libx264',
            '-acodec', 'aac',
            '-ab', '384K',
            '-crf', '21',
            '-bf', '2',
            '-flags', '+cgop',
            '-pix_fmt', 'yuv420p',
            '-movflags', 'faststart',
            '-threads', getConfig('system', 'cores'),
            tmpFile])
    elif rule == 'df-360p-vp9-400k':
        cmd_args, fn = _ffmpegInput(mtype, fn, outDir, basename, UUID)
        cores = getConfig('system', 'cores')
        # use tmpFile instead of /dev/null or NUL to avoid prompts
        # to overwrite, etc.
        # http://wiki.webmproject.org/ffmpeg/vp9-encoding-guide
        run(cmd_args + [
            '-i', fn,
            '-codec:v', 'libvpx-vp9',
            '-pass', '1',
            '-b:v', '400K',
            '-crf', '33',
            '-threads', cores,
            '-speed', '4',
            '-tile-columns', '6',
            '-frame-parallel', '1',
            '-vf', 'scale=-1:360',
            '-an',
            '-f', 'webm',
            tmpFile])
        # the first pass may not have written anything (failure, dry run)
        if os.path.exists(tmpFile):
            os.unlink(tmpFile)
        r = run(cmd_args + [
            '-i', fn,
            '-codec:v', 'libvpx-vp9',
            '-pass', '2',
            '-b:v', '400K',
            '-crf', '33',
            '-threads', cores,
            '-speed', '1',
            '-tile-columns', '6',
            '-frame-parallel', '1',
            '-vf', 'scale=-1:360',
            '-auto-alt-ref', '1',
            '-lag-in-frames', '25',
            '-c:a', 'libopus',
            '-b:a', '64k',
            '-f', 'webm',
            tmpFile])
        # Remove the ffmpeg passlogfile here
        pattern = "^ffmpeg2pass.*$"
        for root, dirs, files in os.walk(os.getcwd()):
            for logname in files:
                if re.match(pattern, logname):
                    os.remove(os.path.join(root, logname))
    elif rule == 'pf-ffv1':
        # We need to enable ffv1 conversion explicitly
        if getConfig('system', 'ffv1', boolean=True) is False:
            return
        cmd_args, fn = _ffmpegInput(mtype, fn, outDir, basename, UUID)
        # FIXME: Improve compression settings to get smaller files,
        # or else disable conversion from open standard formats?
        r = run(cmd_args + [
            '-i', fn,
            '-vcodec', 'ffv1',
            '-acodec', 'pcm_s16le',
            '-threads', getConfig('system', 'cores'),
            tmpFile])
    elif rule in ('df-med-img-vid', 'df-thumb-vid'):
        if mtype == 'vclips':
            # derive thumb from the concatenated h264 clip
            fn = os.path.join(outDir, '%s.df-h264.%s.mp4' % (basename, UUID))
        width = '800' if rule == 'df-med-img-vid' else '320'
        # copy the main thumbnail image from halfway in
        seek_time = float(egaia_meta.getDuration(fn)) // 2
        # put -ss TIME *before* the input to use keyframe seeking (fast!)
        r = run([cmd_ffmpeg, '-ss', '%s' % int(seek_time), '-i', fn,
                 '-frames:v', '1', '-vf', 'scale=%s:-1' % width, tmpFile])
    elif rule == 'df-stills':
        # here `outFile` is a directory path, not a file
        if os.path.isdir(outFile):
            # we need to delete the directory if the ``--force`` flag is given
            shutil.rmtree(outFile)
        os.mkdir(outFile)
        if mtype == 'vclips':
            # get the list of clips in the directory, in mtime order
            clips = _listClips(fn)
        else:
            clips = [fn]
        for n, c in enumerate(clips):
            print(c)
            # one still every six seconds; ffmpeg fills in the %05d counter
            r = run([cmd_ffmpeg,
                     '-i', c,
                     '-vf', 'fps=1/6',
                     os.path.join(outFile, '%03d.%s.%%05d.jpg'
                                  % (n, egaia_parsefn.getBasename(c)))])
        # resize the images to something sensible
        r = run([cmd_mogrify, '-units', 'PixelsPerInch', '-density',
                 '300', '-resize', '240x180>',
                 os.path.join(outFile, '*.jpg')])
        for image in sorted(os.listdir(outFile)):
            # [clip no., basename parts..., frame no., ext]
            image_parts = image.split('.')
            clipname = '.'.join(image_parts[1:-2])
            # image numbering starts at 1; time starts at 0
            frame_no = max(int(image_parts[-2]) - 1, 0)
            secs = frame_no * 6
            # rename to include second count rather than timestamp
            new_image = '%s.df-still.%05d.%s.jpg' % (clipname, secs, UUID)
            os.rename(os.path.join(outFile, image),
                      os.path.join(outFile, new_image))
    elif rule == 'df-contact-sheet':
        # get the list of images
        stills_dir = os.path.join(outDir,
                                  '%s.df-stills.%s.dir' % (basename, UUID))
        if not os.path.isdir(stills_dir):
            print("ERROR: %s is not a directory or does not exist"
                  % stills_dir)
            return
        images = sorted(os.listdir(stills_dir))
        # create html page.
        # Write directly so as to limit memory use -- is this an issue with
        # smaller thumbnails?
        with open(outFile, 'w') as html:
            html.write(u'<!DOCTYPE HTML>')
            html.write(u'<html><head></head>')
            html.write(u'<body>')
            html.write(u'<h1>%s</h1>' % UUID)
            for idx, image in enumerate(images):
                # [basename parts..., 'df-still', seconds, uuid, ext]
                image_parts = image.split('.')
                clipname = '.'.join(image_parts[0:-4])
                secs = int(image_parts[-3])
                t = utils.fmtTime(secs)
                if secs == 0:
                    # first thumb. idx won't work in clip sequences.
                    in_point = '0'
                    out_point = '3'
                elif idx == len(images) - 1:
                    # last thumb
                    in_point = str(secs - 3)
                    out_point = str(secs)
                else:
                    in_point = str(secs - 3)
                    out_point = str(secs + 3)
                cb_txt = u' '.join([UUID, clipname, in_point, out_point])
                img_fn = os.path.join(stills_dir, image)
                with open(img_fn, "rb") as img:
                    # base64 output is pure ASCII; decode so it formats as
                    # text rather than as a bytes repr
                    encoded_img = base64.b64encode(img.read()).decode('ascii')
                # FIXME: Don't overshoot the last image in concatenated
                # sequences
                html.write(u' '.join([
                    u'\n<img src="data:image/jpeg;base64,%s"' % encoded_img,
                    u'title="%s %s"' % (clipname, t),
                    u'class="btn"',
                    u'data-clipboard-text="%s">' % cb_txt
                ]))
            html.write(u'<script type="text/javascript">')
            html.write(strings.clipboard_js)
            html.write(strings.clipboard_js_call)
            html.write(u'</script>')
            html.write(u'</body></html>')
        r = True
    else:
        # programming error?
        return
    if r is False:
        return
    if os.path.exists(tmpFile):
        # move temporary file to the working directory
        os.rename(tmpFile, outFile)
    print("Converted to %s" % outFile)
    return
def process(uuid=None, frame='0', force=False, update=False):
    """Generate a list of files to process and derive each of them.

    Parameters:
      uuid   -- when given, only original files matching this uuid are
                listed; otherwise all original files and directories are
      frame  -- zero-based frame/page/url index handed on to the rules;
                accepts an int or a numeric string
      force  -- passed to findRules(); regenerate existing derivatives
      update -- passed to findRules(); regenerate derivatives older than
                their source

    Returns None.  Raises ValueError if `frame` is not numeric.
    """
    # The rules index a list with this value, so it must be an integer even
    # when the (string) default is used.
    frame = int(frame)
    if uuid:
        filenames = egaia_list.listFiles(filter_type='originals', uuid=uuid)
    else:
        filenames = egaia_list.listFiles(filter_type='originals')
        filenames = filenames + egaia_list.listDirs(filter_type='originals')
    if not filenames:
        print("Nothing to process!")
        return
    for fn in filenames:
        findRules(fn, frame, force, update)
    return
def findRules(fn, frame, force, update):
    """Find the matching processing rule(s) for a given filename and apply
    each of them with makeDeriv().

    Parameters:
      fn     -- path of the original file or directory
      frame  -- index handed to makeDeriv() as its last argument
      force  -- regenerate derivatives that already exist
      update -- regenerate an existing derivative only when the source has
                a newer modification time

    Returns None.  Files without a uuid tag or with an extension not listed
    in `knownFormats` are reported and skipped.
    """
    basename, UUID, extension = egaia_parsefn.parseFilename(fn)
    if UUID is None:
        # don't automatically add uuid if missing; this should be done manually
        print("No uuid tag for this file; ignoring")
        return
    ext = extension.lower()
    # first media type whose extension list contains this extension
    mtype = next((fileFormat
                  for fileFormat, extensions in knownFormats.iteritems()
                  if ext in extensions), None)
    if not mtype:
        print('No conversion rule for this file; ignoring')
        return
    # If items are in subdirectories, we want to keep those.
    outDir = os.path.dirname(fn)
    for rule, suffix in outputFormats[mtype]:
        stem = [basename, rule, UUID]
        outFile = os.path.join(outDir, '.'.join(stem + [suffix]))
        # FIXME: This is not a very robust check -- we may have a derivative
        # that has been moved to a different directory.
        if os.path.exists(outFile):
            if update:
                # don't regenerate if the destination is newer or the same
                if os.stat(fn).st_mtime <= os.stat(outFile).st_mtime:
                    continue
            elif not force:
                print('%s already exists' % outFile)
                continue
        # We will use suffixed temporary file: <FILENAME>.tmp.<EXT>
        # A temporary file can safely be deleted if the conversion fails
        # We can't use a prefix here because fn might include directory paths
        tmpFile = '.'.join(stem + ['tmp', suffix])
        makeDeriv(rule, mtype, fn, UUID, tmpFile, outDir, outFile, basename,
                  frame)
def _cli(args):
    """egaia derive

    Generate derivative formats for archival storage and for distribution.

    Usage:
        egaia derive --help
        egaia derive [ --item=ITEM ] [ --frame=N ] [ --force | --update ]
    """
    # NOTE: the docstring above is the command's usage text; keep any
    # developer documentation in comments so the usage stays unchanged.
    # `args` is a mapping of option names ('--item', '--frame', '--force',
    # '--update') to their parsed values.
    if not egaia_root.get_root():
        sys.exit('Please run "egaia init" to create a new collection here.')
    frame = args['--frame']
    if frame is None:
        frame = '1'
    # use input numbering that starts with 1
    frame = int(frame) - 1
    uuid = None
    if args['--item']:
        uuid = egaia_parsefn.getUuid(args['--item'])
    # --update is honoured for a single item as well; the usage line allows
    # combining it with --item.
    process(uuid=uuid, frame=frame, force=args['--force'],
            update=args['--update'])
    return