diff --git a/Dockerfile b/Dockerfile
index 0930b64..4f8ca13 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,22 +1,26 @@
-FROM alpine:3.9
-MAINTAINER Bartosz Balis
-
-ENV HYPERFLOW_JOB_EXECUTOR_VERSION=v1.0.11
-
-RUN apk --update add openjdk7-jre \
-    && apk add curl bash npm \
-    && apk add --no-cache --repository http://dl-cdn.alpinelinux.org/alpine/v3.9/main/ nodejs=10.14.2-r0 \
-    && apk add python3 libpcap libpcap-dev util-linux
-
-RUN npm install -g https://github.com/hyperflow-wms/hyperflow-job-executor/archive/${HYPERFLOW_JOB_EXECUTOR_VERSION}.tar.gz
-
-WORKDIR /soykb
-COPY software/software.tar.gz .
-RUN tar zxvf software.tar.gz
-RUN chmod +x software/bwa-0.7.4/bwa
-COPY software/*-wrapper ./
-COPY software/libnethogs.so.0.8.5-63-g68033bf /usr/local/lib
-COPY software/nethogs-wrapper.py /usr/local/bin
-RUN chmod +x /usr/local/bin/nethogs-wrapper.py
-
-ENV PATH="/soykb:${PATH}"
+FROM archlinux
+MAINTAINER Mateusz Plinta
+
+ENV HYPERFLOW_JOB_EXECUTOR_VERSION=v1.0.13
+
+RUN pacman -Sy
+RUN pacman -S --needed --noconfirm git jre7-openjdk npm python3 libpcap util-linux base-devel libffi glibc lib32-glibc
+
+RUN pacman -S --needed --noconfirm sudo
+RUN useradd builduser -m
+RUN passwd -d builduser
+RUN printf 'builduser ALL=(ALL) ALL\n' | tee -a /etc/sudoers
+RUN sudo -u builduser bash -c 'cd ~ && git clone https://aur.archlinux.org/ncurses5-compat-libs.git && cd ncurses5-compat-libs && makepkg -si --skippgpcheck --noconfirm'
+RUN sudo -u builduser bash -c 'cd ~ && git clone https://aur.archlinux.org/libffi6.git && cd libffi6 && makepkg -si --noconfirm'
+
+RUN npm install -g https://github.com/hyperflow-wms/hyperflow-job-executor/archive/${HYPERFLOW_JOB_EXECUTOR_VERSION}.tar.gz
+
+WORKDIR /soykb
+COPY software/software.tar.gz .
+RUN tar zxvf software.tar.gz
+COPY software/*-wrapper ./
+COPY software/libnethogs.so.0.8.5-63-g68033bf /usr/local/lib
+COPY software/nethogs-wrapper.py /usr/local/bin
+RUN chmod +x /usr/local/bin/nethogs-wrapper.py
+
+ENV PATH="/soykb:${PATH}"
diff --git a/Dockerfile.alpine b/Dockerfile.alpine
new file mode 100644
index 0000000..3367977
--- /dev/null
+++ b/Dockerfile.alpine
@@ -0,0 +1,24 @@
+# FROM alpine:3.11
+FROM frolvlad/alpine-glibc
+MAINTAINER Bartosz Balis
+
+ENV HYPERFLOW_JOB_EXECUTOR_VERSION=v1.0.11
+
+RUN apk --update add openjdk7-jre \
+    && apk add curl bash ncurses ncurses5 ncurses5-libs npm \
+#   && apk add --no-cache --repository http://dl-cdn.alpinelinux.org/alpine/v3.11/main/ nodejs=10.14.2-r0 \
+    && apk add python3 libpcap libpcap-dev util-linux
+
+RUN npm install -g https://github.com/hyperflow-wms/hyperflow-job-executor/archive/${HYPERFLOW_JOB_EXECUTOR_VERSION}.tar.gz
+
+RUN ln -s /usr/lib/libncurses.so.5 /usr/lib/libtinfo.so.5
+
+WORKDIR /soykb
+COPY software/software.tar.gz .
+RUN tar zxvf software.tar.gz
+COPY software/*-wrapper ./
+COPY software/libnethogs.so.0.8.5-63-g68033bf /usr/local/lib
+COPY software/nethogs-wrapper.py /usr/local/bin
+RUN chmod +x /usr/local/bin/nethogs-wrapper.py
+
+ENV PATH="/soykb:${PATH}"
diff --git a/Makefile b/Makefile
index 0743019..86ac629 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,16 @@
 TAG = $(shell git describe --tags --always)
-PREFIX = hyperflowwms
-REPO_NAME = soykb-workflow-worker
+# PREFIX = $(shell git config --get remote.origin.url | tr ':.' '/' | rev | cut -d '/' -f 3 | rev)
+# REPO_NAME = $(shell git config --get remote.origin.url | tr ':.' '/' | rev | cut -d '/' -f 2 | rev)
+
+REPO_NAME = 'soykb-worker'
+PREFIX = 'hyperflowwms'
 
 all: push
 
 container: image
 
 image:
-	docker build -t $(PREFIX)/$(REPO_NAME) . # Build new image and automatically tag it as latest
+	docker build --no-cache -t $(PREFIX)/$(REPO_NAME) . # Build new image and automatically tag it as latest
 	docker tag $(PREFIX)/$(REPO_NAME) $(PREFIX)/$(REPO_NAME):$(TAG) # Add the version tag to the latest image
 
 push: image
diff --git a/README.md b/README.md
index 424d9cf..7c4cf04 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,16 @@
 # Soykb workflow for HyperFlow
 [![](https://images.microbadger.com/badges/version/hyperflowwms/soykb-workflow-worker.svg)](https://microbadger.com/images/hyperflowwms/soykb-workflow-worker "Get your own version badge on microbadger.com")
 
+## Generate example workflows
+
+Generate an example workflow with:
+- `genwf-size2.sh` (size 2)
+
+The scripts invoke Docker images and create:
+- a `data` subdirectory with the workflow graph `workflow.json` and `haplotype-files.list`
+
+You can also use the `hyperflowwms/soykb-generator` image directly to generate other workflows; see the scripts for example commands. For instance, to generate smaller workflows, use a smaller value for the fastq files parameter.
+
 ## Build and publish image
 
 HyperFlow Docker image contains Soykb binaries and HyperFlow job executor
diff --git a/genwf-size2.sh b/genwf-size2.sh
new file mode 100755
index 0000000..46c86fe
--- /dev/null
+++ b/genwf-size2.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+docker run -v $PWD:/workdir hyperflowwms/soykb-generator sh -c 'generate-workflow 2'
\ No newline at end of file
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..452dc09
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# This script runs everything in containers, so that you only need Docker on your host machine
+
+echo Before running this script, start Redis container as follows:
+echo docker run -d --name redis redis --bind 127.0.0.1
+echo
+
+docker run -a stdout -a stderr --rm --network container:redis -e HF_VAR_WORKER_CONTAINER="hyperflowwms/soykb-workflow-worker" -e HF_VAR_WORK_DIR="$PWD/data" -e HF_VAR_HFLOW_IN_CONTAINER="true" -e HF_VAR_function="redisCommand" -e REDIS_URL="redis://127.0.0.1:6379" --name hyperflow -v /var/run/docker.sock:/var/run/docker.sock -v $PWD:/wfdir --entrypoint "/bin/sh" hyperflowwms/hyperflow:v1.3.23 -c "apk add docker && hflow run /wfdir"
diff --git a/software/bwa-wrapper b/software/bwa-wrapper
index 6d15a87..fe3cf39 100755
--- a/software/bwa-wrapper
+++ b/software/bwa-wrapper
@@ -2,5 +2,7 @@
 
 set -e
 
+export TMPDIR=`pwd`
+
 /soykb/software/bwa-0.7.4/bwa "$@"
 
diff --git a/software/gatk-wrapper b/software/gatk-wrapper
index a8d4a74..ab6b463 100755
--- a/software/gatk-wrapper
+++ b/software/gatk-wrapper
@@ -4,14 +4,16 @@
 export TMPDIR=`pwd`
 
 OUTFILE=`mktemp -t gatk-output.XXXXXXXXXX` || exit 1
-# memory depends on what subsystem we call
-OPTIONS="-Xmx2g -XX:+UseSerialGC"
-#if (echo "'$@'" | grep -i "HaplotypeCaller") >/dev/null; then
-#    OPTIONS="-Xmx2g -XX:+UseSerialGC"
-#elif (echo "'$@'" | grep -i "CombineGVCFs") >/dev/null; then
-#    OPTIONS="-Xmx17g -XX:+UseSerialGC"
-#fi
-OPTIONS="$OPTIONS -Djava.io.tmpdir=$TMPDIR"
+# first argument is the memory limit in GB, the rest are GATK args
+MEM_TOTAL=$1
+shift
+
+# Java -Xmx should be a little lower than the requested memory
+MEM_JAVA_MX=$(($MEM_TOTAL - 2))
+
+OPTIONS="-Xmx${MEM_JAVA_MX}g -XX:+UseSerialGC"
+
+OPTIONS="-Djava.io.tmpdir=$TMPDIR $OPTIONS"
 
 java $OPTIONS \
 	-jar /soykb/software/GenomeAnalysisTK-3.0.0/GenomeAnalysisTK.jar \
diff --git a/software/gunzip-wrapper b/software/gunzip-wrapper
new file mode 100755
index 0000000..1ca980c
--- /dev/null
+++ b/software/gunzip-wrapper
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -e
+
+gunzip -c $1 > $2
+
+
diff --git a/software/picard-wrapper b/software/picard-wrapper
index 81d41f2..4ab9377 100755
--- a/software/picard-wrapper
+++ b/software/picard-wrapper
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 set -e
+export TMPDIR=`pwd`
 
-java -Xmx2g -XX:+UseSerialGC -jar /soykb/software/picard-tools-1.92/"$@"
+java -Djava.io.tmpdir=$TMPDIR -Xmx15g -XX:+UseSerialGC -jar /soykb/software/picard-tools-1.92/"$@"
 
diff --git a/software/samtools-wrapper b/software/samtools-wrapper
new file mode 100755
index 0000000..2b66661
--- /dev/null
+++ b/software/samtools-wrapper
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -e
+
+export TMPDIR=`pwd`
+
+# sometimes we are asked to "merge" only one file
+if [ "X$1" = "Xmerge" -a "X$4" = "X" ]; then
+    # just copy
+    cp "$3" "$2"
+    exit 0
+fi
+
+/soykb/software/samtools-1.0/samtools "$@"
+
diff --git a/software/software-wrapper b/software/software-wrapper
new file mode 100755
index 0000000..43cdd4f
--- /dev/null
+++ b/software/software-wrapper
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+set -e
+
+if [ ! -d /soykb/software ]; then
+    tar xzf /soykb/software.tar.gz
+fi
+
+# fix for leftover files in the home directory at TACC
+find ~/ -maxdepth 1 -name slurm.\* -mtime +5 -exec rm -f {} \; || /bin/true
+find ~/ -maxdepth 1 -name gram\*.log -mtime +5 -exec rm -f {} \; || /bin/true
+
+
diff --git a/software/software.tar.gz b/software/software.tar.gz
index 892e014..119e38f 100644
Binary files a/software/software.tar.gz and b/software/software.tar.gz differ
diff --git a/workflow-generator/.gitignore b/workflow-generator/.gitignore
new file mode 100644
index 0000000..52ff8b6
--- /dev/null
+++ b/workflow-generator/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+software
+software.tar.gz
diff --git a/workflow-generator/Dockerfile b/workflow-generator/Dockerfile
new file mode 100644
index 0000000..4bc1c0d
--- /dev/null
+++ b/workflow-generator/Dockerfile
@@ -0,0 +1,15 @@
+FROM mhart/alpine-node:12
+
+LABEL maintainer "Mateusz Plinta "
+
+RUN apk add python-dev
+
+RUN mkdir /soykb-workflow
+
+ADD . /soykb-workflow/
+
+RUN npm install https://github.com/hyperflow-wms/pegasus-hyperflow-converter/archive/master.tar.gz /
+
+ENV PATH /soykb-workflow:/node_modules/.bin:$PATH
+ENV PYTHONPATH=/soykb-workflow
+WORKDIR /soykb-workflow
diff --git a/workflow-generator/Makefile b/workflow-generator/Makefile
new file mode 100644
index 0000000..ea8f0f5
--- /dev/null
+++ b/workflow-generator/Makefile
@@ -0,0 +1,20 @@
+TAG = $(shell git describe --tags --always)
+# PREFIX = $(shell git config --get remote.origin.url | tr ':.' '/' | rev | cut -d '/' -f 3 | rev)
+# REPO_NAME = $(shell git config --get remote.origin.url | tr ':.' '/' | rev | cut -d '/' -f 2 | rev)
+
+REPO_NAME = 'soykb-generator'
+PREFIX = 'hyperflowwms'
+
+all: push
+
+container: image
+
+image:
+	docker build -t $(PREFIX)/$(REPO_NAME) .
# Build new image and automatically tag it as latest + docker tag $(PREFIX)/$(REPO_NAME) $(PREFIX)/$(REPO_NAME):$(TAG) # Add the version tag to the latest image + +push: image + docker push $(PREFIX)/$(REPO_NAME) # Push image tagged as latest to repository + docker push $(PREFIX)/$(REPO_NAME):$(TAG) # Push version tagged image to repository (since this image is already pushed it will simply create or update version tag) + +clean: diff --git a/workflow-generator/Pegasus/AutoADAG.py b/workflow-generator/Pegasus/AutoADAG.py new file mode 100644 index 0000000..ce59c74 --- /dev/null +++ b/workflow-generator/Pegasus/AutoADAG.py @@ -0,0 +1,93 @@ +# # +# Copyright 2007-2012 University Of Southern California +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # + +__author__ = 'Rajiv Mayani' + +import logging + +try: + from Pegasus.DAX3 import ADAG, Job, File, Executable, PFN, Link, When, DuplicateError +except ImportError, e: + logging.error('Include Pegasus Python libraries in your PYTHONPATH') + + +class AutoADAG(object, ADAG): + """ + Automatically determine the dependencies between jobs based on the file usages. + All jobs consuming a file F depend on the singular job that produces that file. + """ + def __init__(self, name, count=None, index=None): + ADAG.__init__(self, name, count, index) + + def writeXML(self, out): + + mapping = {} + + def addOutput(job, file_obj): + + if file_obj: + file_obj = file_obj.name + + if file_obj not in mapping: + mapping[file_obj] = (set(), set()) + + mapping[file_obj][1].add(job) + + # Automatically determine dependencies + + # Traverse each job + for job_id, job in self.jobs.iteritems(): + file_used = job.used + + # If job produces to stdout, identify it as an output file + addOutput(job, job.stdout) + # If job produces to stderr, identify it as an output file + addOutput(job, job.stderr) + + # If job consumes from stdin, identify it as an input file + if job.stdin: + if job.stdin.name not in mapping: + mapping[job.stdin.name] = (set(), set()) + + mapping[job.stdin.name][0].add(job) + + + for file in file_used: + + if file.name not in mapping: + mapping[file.name] = (set(), set()) + + if file.link == Link.INPUT: + mapping[file.name][0].add(job) + else: + mapping[file.name][1].add(job) + + for file_name, io in mapping.iteritems(): + + # Go through the mapping and for each file add dependencies between the + # job producing a file and the jobs consuming the file + inputs = io[0] + + if len(io[1]) > 0: + output = io[1].pop() + + for input in inputs: + try: + self.depends(parent=output, child=input) + except DuplicateError: + pass + + super(AutoADAG, self).writeXML(out) diff --git a/workflow-generator/Pegasus/DAX2.py b/workflow-generator/Pegasus/DAX2.py new file mode 100644 index 0000000..8b930f8 --- /dev/null +++ b/workflow-generator/Pegasus/DAX2.py @@ -0,0 +1,902 @@ +# Copyright 2009 University Of Southern California +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""API for generating Pegasus DAXes + +The classes in this module can be used to generate DAXes that can be +read by Pegasus. + +The official DAX schema is here: http://pegasus.isi.edu/schema/dax-2.1.xsd +""" +from __future__ import print_function + +__author__ = "Gideon Juve " +__all__ = ["DAX","Filename","Profile","Job","Namespace","LFN", + "parse","parseString"] +__version__ = "2.1" + +import datetime, pwd, os +from cStringIO import StringIO +import xml.sax +import xml.sax.handler +import shlex + +SCHEMA_NAMESPACE = u"http://pegasus.isi.edu/schema/DAX" +SCHEMA_LOCATION = u"http://pegasus.isi.edu/schema/dax-2.1.xsd" +SCHEMA_VERSION = u"2.1" + + +class Namespace: + """Namespace values recognized by Pegasus. You can use these, or just + pass your own value when creating a Profile object (see Profile). + """ + + PEGASUS = u'pegasus' + CONDOR = u'condor' + DAGMAN = u'dagman' + ENV = u'env' + HINTS = u'hints' + GLOBUS = u'globus' + SELECTOR = u'selector' + + +class LFN: + """Logical file name attributes. These include: + + Linkage Attributes: + NONE, INPUT, OUTPUT, INOUT + Type Attributes: + TYPE_DATA, TYPE_EXECUTABLE, TYPE_PATTERN + Transfer Attributes: + XFER_NOT, XFER_OPTIONAL, XFER_MANDATORY + """ + + # Linkage + NONE = u'none' + INPUT = u'input' + OUTPUT = u'output' + INOUT = u'inout' + + # File type + TYPE_DATA = u'data' + TYPE_EXECUTABLE = u'executable' + TYPE_PATTERN = u'pattern' + + # Transfer + XFER_NOT = u'false' + XFER_OPTIONAL = u'optional' + XFER_MANDATORY = u'true' + + +class Filename: + """Filename(filename[,type][,link][,register][,transfer][,optional][,varname]) + + A logical file name. + + Examples: + input = Filename('input.txt',link=LFN.INPUT,transfer=True) + intermediate = Filename('intermediate.txt',link=LFN.OUTPUT) + result = Filename('result.txt',link=LFN.OUTPUT,register=True,transfer=True) + opt = Filename('optional.txt',link=LFN.OUTPUT,optional=True) + binary = Filename('bin/binary',link=LFN.INPUT,type=LFN.TYPE_EXECUTABLE,transfer=True) + """ + + def __init__(self, filename, type=LFN.TYPE_DATA, link=LFN.NONE, + register=False, transfer=LFN.XFER_NOT, optional=None, varname=None): + """ + All arguments specify the workflow-level behavior of this Filename. Job-level + behavior can be defined when adding the Filename to a Job's uses. If the + properties are not overridden at the job-level, then the workflow-level + values are used as defaults. + + If this LFN is to be used as a job's stdin/stdout/stderr then the value + of link is ignored when generating the tags. + + Arguments: + filename: The name of the file (required) + type: The file type (see LFN) + link: Is this file a workflow-level input/output/both? 
(see LFN) + register: The default value for register (True/False) + transfer: The default value for transfer (see LFN, or True/False) + optional: The default value for optional (True/False) + type: The file type (see LFN) + varname: Only used for stdio files + """ + if filename is None: + raise ValueError('filename required') + self.filename = filename + self.link = link + self.register = register + self.transfer = transfer + self.optional = optional + self.type = type + self.varname = varname + + def getFilename(self): + return self.filename + def setFilename(self, filename): + self.filename = filename + def getType(self): + return self.type + def setType(self, type): + self.type = type + def getLink(self): + return self.link + def setLink(self, link): + self.link = link + def getRegister(self): + return self.register + def setRegister(self, register): + self.register = register + def getTransfer(self): + return self.transfer + def setTransfer(self, transfer): + self.transfer = transfer + def getOptional(self): + return self.optional + def setOptional(self, optional): + self.optional = optional + def getVarname(self): + return self.varname + def setVarname(self, varname): + self.varname = varname + + def __str__(self): + """Returns argument-style version of the filename XML tag""" + return self.toArgumentXML() + + def toArgumentXML(self): + """Returns an XML representation of this file as a short filename + tag for use in job arguments""" + return u'' % (self.filename) + + def toFilenameXML(self): + """Returns an XML representation of this file as a filename tag""" + xml = StringIO() + + xml.write(u'') + + result = xml.getvalue() + xml.close() + return result + + def toStdioXML(self, tag): + """Returns an XML representation of this file as a stdin/out/err tag""" + xml = StringIO() + xml.write(u'<%s file="%s"' % (tag, self.filename)) + if self.varname is not None: + xml.write(u' varname="%s"' % self.varname) + if tag is 'stdin': + xml.write(u' link="input"') # stdin is always input + else: + xml.write(u' link="output"') # stdout/stderr are always output + xml.write(u'/>') + + result = xml.getvalue() + xml.close() + return result + + +class Profile: + """Profile(namespace,key,value[,origin]) + + A Profile captures scheduler-, system-, and environment-specific + parameters in a uniform fashion. Each profile declaration assigns a value + to a key within a namespace. The origin records what entity is responsible + for setting the profile and is optional. + + Examples: + path = Profile(Namespace.ENV,'PATH','/bin') + vanilla = Profile(Namespace.CONDOR,'universe','vanilla') + path = Profile(namespace='env',key='PATH',value='/bin') + path = Profile('env','PATH','/bin') + """ + + def __init__(self, namespace, key, value, origin=None): + """ + Arguments: + namespace: The namespace of the profile (see Namespace) + key: The key name. Can be anything that responds to str(). + value: The value for the profile. Can be anything that responds to str(). 
+ origin: The entity responsible for setting this profile (optional) + """ + self.namespace = namespace + self.key = key + self.value = value + self.origin = origin + + def toXML(self): + """Return an XML representation of this profile""" + xml = StringIO() + xml.write(u'') + xml.write(unicode(self.value)) + xml.write(u'') + result = xml.getvalue() + xml.close() + return result + + def __str__(self): + return u'%s:%s = %s' % (self.namespace, self.key, self.value) + + +class Job: + """Job(name[,id][,namespace][,version][,dv_name][,dv_namespace][,dv_version][,level][,compound]) + + This class defines the specifics of a job to run in an abstract manner. + All filename references still refer to logical files. All references + transformations also refer to logical transformations, though + physical location hints can be passed through profiles. + + Examples: + sleep = Job(id="ID0001",name="sleep") + jbsim = Job(id="ID0002",name="jbsim",namespace="cybershake",version="2.1") + merge = Job(name="merge",level=2) + + Several arguments can be added at the same time: + input = Filename(...) + output = Filename(...) + job.addArguments("-i",input,"-o",output) + + Profiles are added similarly: + job.addProfile(Profile(Namespace.ENV,key='PATH',value='/bin')) + + Adding file uses is simple, and you can override global Filename attributes: + job.addUses(input,LFN.INPUT) + job.addUses(output,LFN.OUTPUT,transfer=True,register=True) + """ + + class Use: + """Use(file[,link][,register][,transfer][,optional][,temporaryHint]) + + Use of a logical file name. Used for referencing LFNs in the DAX. + + Note: This class is used internally. You shouldn't need to use it in + your code. You should use Job.addUses(...). + """ + + def __init__(self, file, link=None, register=None, transfer=None, + optional=None, temporaryHint=None): + if file is None: + raise ValueError('file required') + self.file = file + self.link = link + self.optional = optional + self.register = register + self.transfer = transfer + self.temporaryHint = temporaryHint + + def toXML(self): + xml = StringIO() + + if self.link is None: link = self.file.getLink() + else: link = self.link + if self.optional is None: optional = self.file.getOptional() + else: optional = self.optional + if self.register is None: register = self.file.getRegister() + else: register = self.register + if self.transfer is None: transfer = self.file.getTransfer() + else: transfer = self.transfer + type = self.file.getType() + temporaryHint = self.temporaryHint + + xml.write(u'') + + result = xml.getvalue() + xml.close() + return result + + def __init__(self, name, id=None, namespace=None, version=None, + dv_name=None, dv_namespace=None, dv_version=None, + level=None, compound=None): + """The ID for each job should be unique in the DAX. If it is None, then + it will be automatically generated when the job is added to the DAX. + As far as I can tell this ID is only used for uniqueness during + planning, and is otherwise ignored. For example, when Condor is running + the job there doesn't seem to be a way to use this ID to trace the + running job back to its entry in the DAX. + + The name, namespace, and version should match what you have in your + transformation catalog. For example, if namespace="foo" name="bar" + and version="1.0", then the transformation catalog should have an + entry for "foo::bar:1.0". + + Level is the level in the workflow. 
So if you have a workflow with + three jobs--A, B, and C--and you have dependencies between A->B and + B->C, then A is level 1, B is level 2, and C is level 3. You don't + need to specify this because Pegasus calculates it automatically. + + I have no idea what 'compound' does, or what the 'dv_' stuff does. + + Arguments: + name: The transformation name (required) + id: A unique identifier for the job (autogenerated if None) + namespace: The namespace of the transformation + version: The transformation version + dv_name: ? + dv_namespace: ? + dv_version: ? + level: The level of the job in the workflow + compound: ? + """ + if name is None: + raise ValueError('name required') + self.name = name + self.namespace = namespace + self.version = version + self.id = id + self.dv_namespace = dv_namespace + self.dv_name = dv_name + self.dv_version = dv_version + self.level = level + self.compound = compound + + self.arguments = [] + self.profiles = [] + self.uses = [] + + self.stdout = None + self.stderr = None + self.stdin = None + + + def addArguments(self, *arguments): + """Add several arguments to the job""" + self.arguments.extend(arguments) + + def addArgument(self, arg): + """Add an argument to the job""" + self.addArguments(arg) + + def addProfile(self,profile): + """Add a profile to the job""" + self.profiles.append(profile) + + def addUses(self, file, link=None, register=None, transfer=None, + optional=None, temporaryHint=None): + """Add a logical filename that the job uses. + + Optional arguments to this method specify job-level attributes of + the 'uses' tag in the DAX. If not specified, these values default + to those specified when creating the Filename object. + + I don't know what 'temporaryHint' does. + + Arguments: + file: A Filename object representing the logical file name + link: Is this file a job input, output or both (See LFN) + register: Should this file be registered in RLS? (True/False) + transfer: Should this file be transferred? (True/False or See LFN) + optional: Is this file optional, or should its absence be an error? + temporaryHint: ? 
+ """ + use = Job.Use(file,link,register,transfer,optional) + self.uses.append(use) + + def setStdout(self, filename): + """Redirect stdout to a file""" + self.stdout = filename + + def setStderr(self, filename): + """Redirect stderr to a file""" + self.stderr = filename + + def setStdin(self, filename): + """Redirect stdin from a file""" + self.stdin = filename + + def setID(self, id): + """Set the ID of this job""" + self.id = id + + def getID(self): + """Return the job ID""" + return self.id + + def setNamespace(self, namespace): + """Set the transformation namespace for this job""" + self.namespace = namespace + + def getNamespace(self): + """Get the transformation namespace for this job""" + return self.namespace + + def setName(self, name): + """Set the transformation name of this job""" + self.name = name + + def getName(self): + """Get the transformation name of this job""" + return self.name + + def setVersion(self, version): + """Set the version of the transformation""" + self.version = version + + def getVersion(self): + """Get the version of the transformation""" + return self.version + + def toXML(self,level=0,indent=u'\t'): + """Return an XML representation of this job + + Arguments: + level: The level of indentation + indent: The indentation string + """ + xml = StringIO() + indentation = u''.join(indent for x in range(0,level)) + + # Open tag + xml.write(indentation) + xml.write(u'\n') + + # Arguments + if len(self.arguments) > 0: + xml.write(indentation) + xml.write(indent) + xml.write(u'') + xml.write(u' '.join(unicode(x) for x in self.arguments)) + xml.write(u'\n') + + # Profiles + if len(self.profiles) > 0: + for pro in self.profiles: + xml.write(indentation) + xml.write(indent) + xml.write(u'%s\n' % pro.toXML()) + + # Stdin/xml/err + if self.stdin is not None: + xml.write(indentation) + xml.write(indent) + xml.write(self.stdin.toStdioXML('stdin')) + xml.write(u'\n') + if self.stdout is not None: + xml.write(indentation) + xml.write(indent) + xml.write(self.stdout.toStdioXML('stdout')) + xml.write(u'\n') + if self.stderr is not None: + xml.write(indentation) + xml.write(indent) + xml.write(self.stderr.toStdioXML('stderr')) + xml.write(u'\n') + + # Uses + if len(self.uses) > 0: + for use in self.uses: + xml.write(indentation) + xml.write(indent) + xml.write(use.toXML()) + xml.write(u'\n') + + # Close tag + xml.write(indentation) + xml.write(u'') + + result = xml.getvalue() + xml.close() + return result + + +class DAX: + """DAX(name[,count][,index]) + + Representation of a directed acyclic graph in XML (DAX). + + Examples: + dax = DAX('diamond') + part5 = DAX('partition_5',count=10,index=5) + + Adding jobs: + a = Job(...) + dax.addJob(a) + + Adding parent-child control-flow dependency: + dax.addDependency(a,b) + dax.addDependency(a,c) + dax.addDependency(b,d) + dax.addDependency(c,d) + + Adding Filenames (this is not required to produce a valid DAX): + input = Filename(...) 
+ dax.addFilename(input) + + Writing a DAX out to a file: + f = open('diamond.dax','w') + dax.writeXML(f) + f.close() + """ + + class Dependency: + """A control-flow dependency between a child and its parents""" + def __init__(self,child): + self.child = child + self.parents = [] + + def addParent(self, parent): + self.parents.append(parent) + + def toXML(self, level=0, indent=u'\t'): + xml = StringIO() + indentation = ''.join([indent for x in range(0,level)]) + + xml.write(indentation) + xml.write(u'\n' % self.child.getID()) + for parent in self.parents: + xml.write(indentation) + xml.write(indent) + xml.write(u'\n' % parent.getID()) + xml.write(indentation) + xml.write(u'') + + result = xml.getvalue() + xml.close() + return result + + def __init__(self, name, count=1, index=0): + """ + Arguments: + name: The name of the workflow + count: Total number of DAXes that will be created + index: Zero-based index of this DAX + """ + self.name = name + self.count = count + self.index = index + + # This is used to generate unique ID numbers + self.sequence = 1 + + self.jobs = [] + self.filenames = [] + self.lookup = {} # A lookup table for dependencies + self.dependencies = [] + + def getName(self): + return self.name + + def setName(self,name): + self.name = name + + def getCount(self): + return self.count + + def setCount(self,count): + self.count = count + + def getIndex(self): + return self.index + + def setIndex(self,index): + self.index = index + + def addJob(self,job): + """Add a job to the list of jobs in the DAX""" + # Add an auto-generated ID if the job doesn't have one + if job.getID() is None: + job.setID("ID%07d" % self.sequence) + self.sequence += 1 + self.jobs.append(job) + + def addFilename(self, filename): + """Add a filename""" + self.filenames.append(filename) + + def addDependency(self, parent, child): + """Add a control flow dependency""" + if not child in self.lookup: + dep = DAX.Dependency(child) + self.lookup[child] = dep + self.dependencies.append(dep) + self.lookup[child].addParent(parent) + + def writeXML(self, out, indent='\t'): + """Write the DAX as XML to a stream""" + + # Preamble + out.write(u'\n') + + # Metadata + out.write(u'\n' % datetime.datetime.now()) + out.write(u'\n' % pwd.getpwuid(os.getuid())[0]) + out.write(u'\n') + + # Open tag + out.write(u'\n' % (len(self.jobs), len(self.filenames), len(self.dependencies))) + + # Files + out.write(u'\n%s\n' % indent) + for filename in self.filenames: + out.write(indent) + out.write(filename.toFilenameXML()) + out.write('\n') + + # Jobs + out.write(u'\n%s\n' % indent) + for job in self.jobs: + out.write(job.toXML(level=1,indent=indent)) + out.write(u'\n') + + # Dependencies + out.write(u'\n%s\n' % indent) + for dep in self.dependencies: + out.write(dep.toXML(level=1,indent=indent)) + out.write(u'\n') + + # Close tag + out.write(u'\n') + + +class DAXHandler(xml.sax.handler.ContentHandler): + """ + This is a DAX parser + """ + def __init__(self): + self.dax = None + self.jobmap = {} + self.filemap = {} + self.lastJob = None + self.lastChild = None + self.lastArguments = None + self.lastProfile = None + + def startElement(self, name, attrs): + if name == "adag": + name = attrs.get("name") + count = int(attrs.get("count","1")) + index = int(attrs.get("index","0")) + self.dax = DAX(name,count,index) + elif name == "filename": + if self.lastJob is None: + file = attrs.get("file") + link = attrs.get("link") + optional = attrs.get("optional") + f = Filename(file, type=None, link=link, register=None, + transfer=None, 
optional=optional) + self.dax.addFilename(f) + self.filemap[name] = f + else: + name = attrs.get("file") + if name in self.filemap: + f = self.filemap[name] + else: + f = Filename(name) + self.filemap[name] = f + if self.lastProfile is None: + self.lastArguments.append(f) + else: + self.lastProfile.value = f + elif name == "job": + id = attrs.get("id") + namespace = attrs.get("namespace") + name = attrs.get("name") + version = attrs.get("version") + dv_namespace = attrs.get("dv-namespace") + dv_name = attrs.get("dv-name") + dv_version = attrs.get("dv-version") + level = attrs.get("level") + compound = attrs.get("compound") + job = Job(id=id, namespace=namespace, name=name, version=version, + dv_namespace=dv_namespace, dv_name=dv_name, dv_version=dv_version, + level=level, compound=compound) + self.dax.addJob(job) + self.lastJob = job + self.jobmap[job.getID()] = job + elif name == "argument": + self.lastArguments = [] + elif name == "profile": + namespace = attrs.get("namespace") + key = attrs.get("key") + self.lastProfile = Profile(namespace,key,"") + self.lastJob.addProfile(self.lastProfile) + elif name in ["stdin","stdout","stderr"]: + file = attrs.get("file") + link = attrs.get("link") + varname = attrs.get("varname") + if file in self.filemap: + f = self.filemap[file] + else: + f = Filename(file,link=link) + self.filemap[file] = f + if varname is not None: + if f.varname is None: + f.varname = varname + if name == "stdin": + self.lastJob.setStdin(f) + elif name == "stdout": + self.lastJob.setStdout(f) + else: + self.lastJob.setStderr(f) + elif name == "uses": + file = attrs.get("file") + link = attrs.get("link") + register = attrs.get("register") + transfer = attrs.get("transfer") + type = attrs.get("type") + temporaryHint = attrs.get("temporaryHint") + if file in self.filemap: + f = self.filemap[file] + if f.type is None: + f.type = type + else: + f = Filename(file, type=type, link=link, + register=register, transfer=transfer) + self.filemap[file] = f + self.lastJob.addUses(f,link=link,register=register, + transfer=transfer,temporaryHint=temporaryHint) + elif name == "child": + id = attrs.get("ref") + child = self.jobmap[id] + self.lastChild = child + elif name == "parent": + id = attrs.get("ref") + parent = self.jobmap[id] + self.dax.addDependency(parent, self.lastChild) + + def characters(self, chars): + if self.lastArguments is not None: + self.lastArguments += [unicode(a) for a in shlex.split(chars)] + elif self.lastProfile is not None: + self.lastProfile.value += chars + + def endElement(self, name): + if name == "child": + self.lastChild = None + elif name == "job": + self.lastJob = None + elif name == "argument": + self.lastJob.addArguments(*self.lastArguments) + self.lastArguments = None + elif name == "profile": + self.lastProfile = None + + +def parse(fname): + """ + Parse DAX from a Pegasus DAX file. 
+ """ + handler = DAXHandler() + xml.sax.parse(fname, handler) + return handler.dax + + +def parseString(string): + """ + Parse DAX from a string + """ + handler = DAXHandler() + xml.sax.parseString(string, handler) + return handler.dax + + +if __name__ == '__main__': + """An example of using the DAX API""" + + # Create a DAX + diamond = DAX("diamond") + + # Create some logical file names + a = Filename("f.a",link=LFN.INPUT,transfer=True) + b1 = Filename("f.b1",link=LFN.OUTPUT,transfer=True) + b2 = Filename("f.b2",link=LFN.OUTPUT,transfer=True) + c1 = Filename("f.c1",link=LFN.OUTPUT,transfer=True) + c2 = Filename("f.c2",link=LFN.OUTPUT,transfer=True) + d = Filename("f.d",link=LFN.OUTPUT,transfer=True,register=True) + + # Add the filenames to the DAX (this is not strictly required) + diamond.addFilename(a) + diamond.addFilename(d) + + # Add a preprocess job + preprocess = Job(namespace="diamond",name="preprocess",version="2.0") + preprocess.addArguments("-a preprocess","-T60","-i",a,"-o",b1,b2) + preprocess.addUses(a,link=LFN.INPUT) + preprocess.addUses(b1,link=LFN.OUTPUT) + preprocess.addUses(b2,link=LFN.OUTPUT) + diamond.addJob(preprocess) + + # Add left Findrange job + frl = Job(namespace="diamond",name="findrange",version="2.0") + frl.addArguments("-a findrange","-T60","-i",b1,"-o",c1) + frl.addUses(b1,link=LFN.INPUT) + frl.addUses(c1,link=LFN.OUTPUT) + diamond.addJob(frl) + + # Add right Findrange job + frr = Job(namespace="diamond",name="findrange",version="2.0") + frr.addArguments("-a findrange","-T60","-i",b2,"-o",c2) + frr.addUses(b2,link=LFN.INPUT) + frr.addUses(c2,link=LFN.OUTPUT) + diamond.addJob(frr) + + # Add Analyze job + analyze = Job(namespace="diamond",name="analyze",version="2.0") + analyze.addArguments("-a analyze","-T60","-i",c1,c2,"-o",d) + analyze.addUses(c1,link=LFN.INPUT) + analyze.addUses(c2,link=LFN.INPUT) + analyze.addUses(d,link=LFN.OUTPUT) + diamond.addJob(analyze) + + # Add control-flow dependencies + diamond.addDependency(parent=preprocess, child=frl) + diamond.addDependency(parent=preprocess, child=frr) + diamond.addDependency(parent=frl, child=analyze) + diamond.addDependency(parent=frr, child=analyze) + + out = StringIO() + diamond.writeXML(out) + foo1 = out.getvalue() + out.close() + + diamond = parseString(foo1) + + out = StringIO() + diamond.writeXML(out) + foo2 = out.getvalue() + out.close() + + print(foo1) + print(foo2) + diff --git a/workflow-generator/Pegasus/DAX3.py b/workflow-generator/Pegasus/DAX3.py new file mode 100644 index 0000000..e41b9be --- /dev/null +++ b/workflow-generator/Pegasus/DAX3.py @@ -0,0 +1,2334 @@ +# Copyright 2010 University Of Southern California +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""API for generating Pegasus DAXes + +The classes in this module can be used to generate DAXes that can be +read by Pegasus. 
+ +The official DAX schema is here: http://pegasus.isi.edu/schema/ + +Here is an example showing how to create the diamond DAX using this API: + +# Create a DAX +diamond = ADAG("diamond") + +# Add some metadata +diamond.metadata("name", "diamond") +diamond.metadata("createdby", "Gideon Juve") + +# Add input file to the DAX-level replica catalog +a = File("f.a") +a.addPFN(PFN("gsiftp://site.com/inputs/f.a","site")) +a.metadata("size", "1024") +diamond.addFile(a) + +# Add executables to the DAX-level replica catalog +e_preprocess = Executable(namespace="diamond", name="preprocess", version="4.0", os="linux", arch="x86_64") +e_preprocess.metadata("size", "2048") +e_preprocess.addPFN(PFN("gsiftp://site.com/bin/preprocess","site")) +diamond.addExecutable(e_preprocess) + +e_findrange = Executable(namespace="diamond", name="findrange", version="4.0", os="linux", arch="x86_64") +e_findrange.addPFN(PFN("gsiftp://site.com/bin/findrange","site")) +diamond.addExecutable(e_findrange) + +e_analyze = Executable(namespace="diamond", name="analyze", version="4.0", os="linux", arch="x86_64") +e_analyze.addPFN(PFN("gsiftp://site.com/bin/analyze","site")) +diamond.addExecutable(e_analyze) + +# Add a preprocess job +preprocess = Job(e_preprocess) +preprocess.metadata("time", "60") +b1 = File("f.b1") +b2 = File("f.b2") +preprocess.addArguments("-a preprocess","-T60","-i",a,"-o",b1,b2) +preprocess.uses(a, link=Link.INPUT) +preprocess.uses(b1, link=Link.OUTPUT, transfer=True) +preprocess.uses(b2, link=Link.OUTPUT, transfer=True) +diamond.addJob(preprocess) + +# Add left Findrange job +frl = Job(e_findrange) +frl.metadata("time", "60") +c1 = File("f.c1") +frl.addArguments("-a findrange","-T60","-i",b1,"-o",c1) +frl.uses(b1, link=Link.INPUT) +frl.uses(c1, link=Link.OUTPUT, transfer=True) +diamond.addJob(frl) + +# Add right Findrange job +frr = Job(e_findrange) +frr.metadata("time", "60") +c2 = File("f.c2") +frr.addArguments("-a findrange","-T60","-i",b2,"-o",c2) +frr.uses(b2, link=Link.INPUT) +frr.uses(c2, link=Link.OUTPUT, transfer=True) +diamond.addJob(frr) + +# Add Analyze job +analyze = Job(e_analyze) +analyze.metadata("time", "60") +d = File("f.d") +analyze.addArguments("-a analyze","-T60","-i",c1,c2,"-o",d) +analyze.uses(c1, link=Link.INPUT) +analyze.uses(c2, link=Link.INPUT) +analyze.uses(d, link=Link.OUTPUT, transfer=True, register=True) +diamond.addJob(analyze) + +# Add dependencies +diamond.depends(parent=preprocess, child=frl) +diamond.depends(parent=preprocess, child=frr) +diamond.depends(parent=frl, child=analyze) +diamond.depends(parent=frr, child=analyze) + +# Write the DAX to stdout +import sys +diamond.writeXML(sys.stdout) + +# Write the DAX to a file +f = open("diamond.dax","w") +diamond.writeXML(f) +f.close() +""" + +__author__ = "Gideon Juve , Rafael Ferreira da Silva " + +__version__ = "3.6" + +__all__ = [ + "DAX3Error", + "DuplicateError", + "NotFoundError", + "FormatError", + "ParseError", + "Element", + "Namespace", + "ContainerType", + "Arch", + "Link", + "Transfer", + "OS", + "When", + "Invoke", + "InvokeMixin", + "ProfileMixin", + "MetadataMixin", + "PFNMixin", + "CatalogType", + "File", + "Executable", + "Container", + "Metadata", + "PFN", + "Profile", + "Use", + "UseMixin", + "Transformation", + "AbstractJob", + "Job", + "DAX", + "DAG", + "Dependency", + "ADAG", + "parseString", + "parse" +] + +import datetime, os, sys +import codecs +import shlex +import codecs +import warnings + +if sys.version_info >= (3, 0): + # compatibility with Python 3 + from past.builtins import basestring + 
+try: + from StringIO import StringIO +except ImportError: + from io import StringIO + +SCHEMA_NAMESPACE = "http://pegasus.isi.edu/schema/DAX" +SCHEMA_LOCATION = "http://pegasus.isi.edu/schema/dax-3.6.xsd" +SCHEMA_VERSION = "3.6" + + +class DAX3Error(Exception): pass + + +class DuplicateError(DAX3Error): pass + + +class NotFoundError(DAX3Error): pass + + +class FormatError(DAX3Error): pass + + +class ParseError(DAX3Error): pass + + +class Element: + """Representation of an XML element for formatting output""" + + def __init__(self, name, attrs=[]): + self.name = name + self.attrs = [] + for attr, value in attrs: + if value is not None: + if isinstance(value, bool): + value = str(value).lower() + elif not isinstance(value, basestring): + value = repr(value) + attr = attr.replace('__', ':') + self.attrs.append((attr, value)) + self.children = [] + self.flat = False + + def _escape(self, text): + """Escape special characters in XML""" + o = [] + for c in text: + if c == '"': + o.append(""") + elif c == "'": + o.append("'") + elif c == "<": + o.append("<") + elif c == ">": + o.append(">") + elif c == "&": + o.append("&") + else: + o.append(c) + return ''.join(o) + + def element(self, element): + self.children.append(element) + return element + + def text(self, value): + if not isinstance(value, basestring): + value = str(value) + self.children.append(self._escape(value)) + return self + + def comment(self, message): + self.children.append("" % self._escape(message)) + + def flatten(self): + self.flat = True + return self + + def __unicode__(self): + s = StringIO() + self.write(s) + x = s.getvalue() + s.close() + return unicode(x) + + def __str__(self): + return unicode(self).encode('utf-8') + + def write(self, stream=sys.stdout, level=0, flatten=False): + flat = self.flat or flatten + + stream.write('<%s' % self.name) + + for attr, value in self.attrs: + value = self._escape(value) + stream.write(' %s="%s"' % (attr, value)) + + if len(self.children) == 0: + stream.write('/>') + else: + stream.write('>') + if not flat: + stream.write('\n') + for child in self.children: + if not flat: + stream.write('\t' * (level + 1)) + if isinstance(child, basestring): + stream.write(child) + else: + child.write(stream, level + 1, flat) + if not flat: + stream.write('\n') + if not flat: + stream.write('\t' * level) + stream.write('' % self.name) + + +class Namespace: + """ + Namespace values recognized by Pegasus. See Executable, + Transformation, and Job. + """ + PEGASUS = 'pegasus' + CONDOR = 'condor' + DAGMAN = 'dagman' + ENV = 'env' + HINTS = 'hints' + GLOBUS = 'globus' + SELECTOR = 'selector' + STAT = 'stat' + + +class Arch: + """ + Architecture types. See Executable. + """ + X86 = 'x86' + X86_64 = 'x86_64' + PPC = 'ppc' + PPC_64 = 'ppc_64' + IA64 = 'ia64' + SPARCV7 = 'sparcv7' + SPARCV9 = 'sparcv9' + AMD64 = 'amd64' + + +class Link: + """ + Linkage attributes. See File, Executable and uses(). + """ + NONE = 'none' + INPUT = 'input' + OUTPUT = 'output' + INOUT = 'inout' + CHECKPOINT = 'checkpoint' + + +class Transfer: + """ + Transfer types for uses. See Executable, File. + """ + FALSE = 'false' + OPTIONAL = 'optional' + TRUE = 'true' + + +class OS: + """ + OS types. See Executable. + """ + LINUX = 'linux' + SUNOS = 'sunos' + AIX = 'aix' + MACOS = 'macos' + WINDOWS = 'windows' + + +class When: + """ + Job states for notifications. See Job/DAX/DAG.invoke(). 
+ """ + NEVER = 'never' + START = 'start' + ON_ERROR = 'on_error' + ON_SUCCESS = 'on_success' + AT_END = 'at_end' + ALL = 'all' + + +class ContainerType: + """ + Container types. See Container. + """ + DOCKER = 'docker' + SINGULARITY = 'singularity' + + +class Invoke: + def __init__(self, when, what): + if not when: + raise FormatError("invalid when", when) + if not what: + raise FormatError("invalid what", what) + self.when = when + self.what = what + + def __unicode__(self): + return u"" % (self.when, self.what) + + def __str__(self): + return unicode(self).encode('utf-8') + + def __hash__(self): + return hash((self.when, self.what)) + + def __eq__(self, other): + if isinstance(other, Invoke): + return self.when == other.when and self.what == other.what + return False + + def toXML(self): + e = Element('invoke', [('when', self.when)]) + e.text(self.what) + e.flatten() + return e + + +class InvokeMixin: + def addInvoke(self, invoke): + """Add invoke to this object""" + if self.hasInvoke(invoke): + raise DuplicateError("Duplicate Invoke %s" % invoke) + self.invocations.add(invoke) + + def hasInvoke(self, invoke): + """Test to see if this object has invoke""" + return invoke in self.invocations + + def removeInvoke(self, invoke): + """Remove invoke from this object""" + if not self.hasInvoke(invoke): + raise NotFoundError("Invoke not found", invoke) + self.invocations.remove(invoke) + + def clearInvokes(self): + """Remove all Invoke objects""" + self.invocations.clear() + + def invoke(self, when, what): + """ + Invoke executable 'what' when job reaches status 'when'. The value of + 'what' should be a command that can be executed on the submit host. + + The list of valid values for 'when' is: + + WHEN MEANING + ========== ======================================================= + never never invoke + start invoke just before job gets submitted. + on_error invoke after job finishes with failure (exitcode != 0). + on_success invoke after job finishes with success (exitcode == 0). + at_end invoke after job finishes, regardless of exit status. + all like start and at_end combined. 
+ + Examples: + obj.invoke('at_end','/usr/bin/mail -s "job done" juve@usc.edu') + obj.invoke('on_error','/usr/bin/update_db -failure') + """ + self.addInvoke(Invoke(when, what)) + + +class ProfileMixin: + def addProfile(self, profile): + """Add a profile to this object""" + if self.hasProfile(profile): + raise DuplicateError("Duplicate profile %s" % profile) + self.profiles.add(profile) + + def hasProfile(self, profile): + """Does this object have profile?""" + return profile in self.profiles + + def removeProfile(self, profile): + """Remove profile from this object""" + if not self.hasProfile(profile): + raise NotFoundError("Profile not found", profile) + self.profiles.remove(profile) + + def clearProfiles(self): + """Remove all profiles from this object""" + self.profiles.clear() + + def profile(self, namespace, key, value): + """Declarative profile addition""" + self.addProfile(Profile(namespace, key, value)) + + +class MetadataMixin: + def addMetadata(self, metadata): + """Add metadata to this object""" + if self.hasMetadata(metadata): + raise DuplicateError("Duplicate Metadata %s" % metadata) + self._metadata.add(metadata) + + def removeMetadata(self, metadata): + """Remove meta from this object""" + if not self.hasMetadata(metadata): + raise NotFoundError("Metadata not found", metadata) + self._metadata.remove(metadata) + + def hasMetadata(self, metadata): + """Does this object have metadata?""" + return metadata in self._metadata + + def clearMetadata(self): + """Remove all metadata from this object""" + self._metadata.clear() + + def metadata(self, key, value): + """Declarative metadata addition""" + self.addMetadata(Metadata(key, value)) + + +class PFNMixin: + def addPFN(self, pfn): + """Add a PFN to this object""" + if self.hasPFN(pfn): + raise DuplicateError("Duplicate PFN %s" % pfn) + self.pfns.add(pfn) + + def removePFN(self, pfn): + """Remove PFN from this object""" + if not self.hasPFN(pfn): + raise NotFoundError("PFN not found", pfn) + self.pfns.remove(pfn) + + def hasPFN(self, pfn): + """Does this object have pfn?""" + return pfn in self.pfns + + def clearPFNs(self): + """Remove all PFNs from this object""" + self.pfns.clear() + + def PFN(self, url, site=None): + """Declarative PFN addition""" + self.addPFN(PFN(url, site)) + + +class CatalogType(ProfileMixin, MetadataMixin, PFNMixin): + """Base class for File and Executable""" + + def __init__(self, name): + """ + All arguments specify the workflow-level behavior of this File. Job-level + behavior can be defined when adding the File to a Job's uses. If the + properties are not overridden at the job-level, then the workflow-level + values are used as defaults. + + If this LFN is to be used as a job's stdin/stdout/stderr then the value + of link is ignored when generating the tags. + + Arguments: + name: The name of the file (required) + """ + if not name: + raise FormatError('name required') + self.name = name + self.profiles = set() + self._metadata = set() + self.pfns = set() + + def innerXML(self, parent): + for p in self.profiles: + parent.element(p.toXML()) + for m in self._metadata: + parent.element(m.toXML()) + for p in self.pfns: + parent.element(p.toXML()) + + +class File(CatalogType): + """File(name) + + A file entry for the DAX-level replica catalog, or a reference to a logical file + used by the workflow. 
+ + Examples: + input = File('input.txt') + + Example use in job: + input = File('input.txt') + output = File('output.txt') + job = Job(name="compute") + job.uses(input, link=Link.INPUT, transfer=True) + job.uses(output, link=Link.OUTPUT, transfer=True, register=True) + """ + + def __init__(self, name): + """ + All arguments specify the workflow-level behavior of this File. Job-level + behavior can be defined when adding the File to a Job's uses. If the + properties are not overridden at the job-level, then the workflow-level + values are used as defaults. + + If this LFN is to be used as a job's stdin/stdout/stderr then the value + of link is ignored when generating the tags. + + Arguments: + name: The name of the file (required) + """ + CatalogType.__init__(self, name) + + def __unicode__(self): + return u"" % self.name + + def __str__(self): + return unicode(self).encode('utf-8') + + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return isinstance(other, File) and self.name == other.name + + def toArgumentXML(self): + """Returns an XML representation of this File with no inner elements""" + return Element('file', [('name', self.name)]) + + def toStdioXML(self, tag): + """Returns an XML representation of this file as a stdin/out/err tag""" + if tag is 'stdin': + link = "input" # stdin is always input + elif tag in ['stdout', 'stderr']: + link = "output" # stdout/stderr are always output + else: + raise FormatError("invalid tag", tag, "should be one of stdin, stdout, stderr") + + return Element(tag, [ + ('name', self.name), + ('link', link) + ]) + + def toXML(self): + """Return the XML representation of this File with inner elements""" + e = self.toArgumentXML() + self.innerXML(e) + return e + + +class Executable(CatalogType, InvokeMixin): + """Executable(name[,namespace][,version][,arch][,os][,osrelease][,osversion][,glibc][,installed]) + + An entry for an executable in the DAX-level transformation catalog. 
+ + Examples: + grep = Executable("grep") + grep = Executable(namespace="os",name="grep",version="2.3") + grep = Executable(namespace="os",name="grep",version="2.3",arch=Arch.X86) + grep = Executable(namespace="os",name="grep",version="2.3",arch=Arch.X86,os=OS.LINUX) + """ + + def __init__(self, name, namespace=None, version=None, arch=None, os=None, + osrelease=None, osversion=None, glibc=None, installed=None, + container=None): + """ + Arguments: + name: Logical name of executable + namespace: Executable namespace + version: Executable version + arch: Architecture that this exe was compiled for + os: Name of os that this exe was compiled for + osrelease: Release of os that this exe was compiled for + osversion: Version of os that this exe was compiled for + glibc: Version of glibc this exe was compiled against + installed: Is the executable installed (true), or stageable (false) + container: Optional attribute to specify the container to use + """ + CatalogType.__init__(self, name) + self.namespace = namespace + self.version = version + self.arch = arch + self.os = os + self.osrelease = osrelease + self.osversion = osversion + self.glibc = glibc + self.installed = installed + self.container = container + self.invocations = set() + + def __unicode__(self): + return u"" % (self.namespace, self.name, self.version) + + def __str__(self): + return unicode(self).encode('utf-8') + + def __hash__(self): + return hash((self.name, + self.namespace, + self.version, + self.arch, + self.os, + self.osrelease, + self.osversion, + self.glibc, + self.installed, + self.container)) + + def __eq__(self, other): + if isinstance(other, Executable): + return self.name == other.name and \ + self.namespace == other.namespace and \ + self.version == other.version and \ + self.arch == other.arch and \ + self.os == other.os and \ + self.osrelease == other.osrelease and \ + self.osversion == other.osversion and \ + self.glibc == other.glibc and \ + self.installed == other.installed and \ + self.container == other.container + return False + + def toXML(self): + """Returns an XML representation of this file as a filename tag""" + e = Element('executable', [ + ('name', self.name), + ('namespace', self.namespace), + ('version', self.version), + ('arch', self.arch), + ('os', self.os), + ('osrelease', self.osrelease), + ('osversion', self.osversion), + ('glibc', self.glibc), + ('installed', self.installed) + # containers are not support by the DAX3 schema + ]) + self.innerXML(e) + + if self.container: + warnings.warn('The DAX API extensions do not support references for containers.') + + # Invocations + for inv in self.invocations: + e.element(inv.toXML()) + + return e + + +class Container(ProfileMixin): + """Container(name,type,image[,image_site]) + + An entry for a container in the DAX-level transformation catalog. 
+ + Examples: + mycontainer = Container("myapp", type="docker", image="docker:///rynge/montage:latest") + """ + + def __init__(self, name, type, image, imagesite=None, dockerfile=None, mount=None): + """ + Arguments: + name: Container name + type: Container type (see ContainerType) + image: URL to image in a container hub OR URL to an existing container image + imagesite: optional site attribute to tell pegasus which site tar file exist + dockerfile: a url to an existing docker file to build container image from scratch + mount: list of volumes to be mounted + """ + if not name: + raise FormatError("Invalid name", name) + if not type: + raise FormatError("Invalid container type", type) + if not image: + raise FormatError("Invalid image", image) + self.name = name + self.type = type + self.image = image + self.imagesite = imagesite + self.dockerfile = dockerfile + self.mount = mount if mount else [] + self.profiles = set() + + def __unicode__(self): + return u"" % (self.name, self.type) + + def __str__(self): + return unicode(self).encode('utf-8') + + def __hash__(self): + return hash((self.name, + self.type, + self.image, + self.imagesite, + self.dockerfile)) + + def __eq__(self, other): + if isinstance(other, Container): + return self.name == other.name and \ + self.type == other.type and \ + self.image == other.image and \ + self.imagesite == other.imagesite and \ + self.dockerfile == other.dockerfile + return False + + +class Metadata: + """Metadata(key,value) + + A way to add metadata to File and Executable objects. This is + useful if you want to annotate the DAX with things like file + sizes, application-specific attributes, etc. + + There is currently no restriction on the type. + + Examples: + s = Metadata('size','12') + a = Metadata('algorithm','plav') + """ + + def __init__(self, key, value): + """ + Arguments: + key: The key name of the item + value: The value of the item + """ + if not key: + raise FormatError("Invalid key", key) + if not value: + raise FormatError("Invalid value", value) + self.key = key + self.value = value + + def __unicode__(self): + return u"" % (self.key, self.value) + + def __str__(self): + return unicode(self).encode('utf-8') + + def __hash__(self): + return hash(self.key) + + def __eq__(self, other): + return isinstance(other, Metadata) and self.key == other.key + + def toXML(self): + m = Element('metadata', [ + ('key', self.key) + ]) + m.text(self.value).flatten() + return m + + +class PFN(ProfileMixin): + """PFN(url[,site]) + + A physical file name. Used to provide URLs for files and executables + in the DAX-level replica catalog. + + PFNs can be added to File and Executable. + + Examples: + PFN('http://site.com/path/to/file.txt','site') + PFN('http://site.com/path/to/file.txt',site='site') + PFN('http://site.com/path/to/file.txt') + """ + + def __init__(self, url, site=None): + """ + Arguments: + url: The url of the file. + site: The name of the site. 
[default: local] + """ + if not url: + raise FormatError("Invalid url", url) + if not site: + raise FormatError("Invalid site", site) + self.url = url + self.site = site + self.profiles = set() + + def __unicode__(self): + return u"" % (self.site, self.url) + + def __str__(self): + return unicode(self).encode('utf-8') + + def __hash__(self): + return hash((self.url, self.site)) + + def __eq__(self, other): + return isinstance(other, PFN) and \ + self.url == other.url and \ + self.site == other.site + + def toXML(self): + pfn = Element('pfn', [ + ('url', self.url), + ('site', self.site) + ]) + for p in self.profiles: + pfn.element(p.toXML()) + return pfn + + +class Profile: + """Profile(namespace,key,value) + + A Profile captures scheduler-, system-, and environment-specific + parameters in a uniform fashion. Each profile declaration assigns a value + to a key within a namespace. + + Profiles can be added to Job, DAX, DAG, File, Executable, and PFN. + + Examples: + path = Profile(Namespace.ENV,'PATH','/bin') + vanilla = Profile(Namespace.CONDOR,'universe','vanilla') + path = Profile(namespace='env',key='PATH',value='/bin') + path = Profile('env','PATH','/bin') + """ + + def __init__(self, namespace, key, value): + """ + Arguments: + namespace: The namespace of the profile (see Namespace) + key: The key name. Can be anything that responds to str(). + value: The value for the profile. Can be anything that responds to str(). + """ + self.namespace = namespace + self.key = key + self.value = value + + def __unicode__(self): + return u"" % (self.namespace, self.key, self.value) + + def __str__(self): + return unicode(self).encode('utf-8') + + def __hash__(self): + return hash((self.namespace, self.key)) + + def __eq__(self, other): + return isinstance(other, Profile) and \ + self.namespace == other.namespace and \ + self.key == other.key + + def toXML(self): + """Return an XML element for this profile""" + p = Element("profile", [ + ('namespace', self.namespace), + ('key', self.key) + ]) + p.text(self.value).flatten() + return p + + +class Use(MetadataMixin): + """Use(file[,link][,register][,transfer][,optional] + [,namespace][,version][,executable][,size]) + + Use of a logical file name. Used for referencing files in the DAX. + + Attributes: + file: A string, File or Executable representing the logical file + link: Is this file a job input, output or both (See LFN) (optional) + register: Should this file be registered in RLS? (True/False) (optional) + transfer: Should this file be transferred? (True/False or See LFN) (optional) + optional: Is this file optional, or should its absence be an error? (optional) + namespace: Namespace of executable (optional) + version: version of executable (optional) + executable: Is file an executable? (True/False) (optional) + size: The size of the file (optional) + + For Use objects that are added to Transformations, the attributes 'link', 'register', + 'transfer', 'optional' and 'size' are ignored. + + If a File object is passed in as 'file', then the default value for executable + is 'false'. Similarly, if an Executable object is passed in, then the default + value for executable is 'true'. 
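+
+    Use objects are normally created indirectly, through the uses() method of
+    the UseMixin defined below, rather than constructed by hand. A minimal,
+    illustrative sketch (the file and job names here are hypothetical):
+
+        input = File("reads.fastq")
+        job = Job("bwa-wrapper")
+        job.uses(input, link=Link.INPUT)
+        job.uses(File("aligned.sam"), link=Link.OUTPUT, transfer=True, register=False)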
+ """ + + def __init__(self, name, link=None, register=None, transfer=None, + optional=None, namespace=None, version=None, executable=None, + size=None): + if not name: + raise FormatError('Invalid name', name) + + self.name = name + self.link = link + self.optional = optional + self.register = register + self.transfer = transfer + self.namespace = namespace + self.version = version + self.executable = executable + self.size = size + + self._metadata = set() + + def __unicode__(self): + return u"" % (self.namespace, self.name, self.version) + + def __str__(self): + return unicode(self).encode("utf-8") + + def __hash__(self): + return hash((self.namespace, self.name, self.version)) + + def __eq__(self, other): + if isinstance(other, Use): + return self.namespace == other.namespace and \ + self.name == other.name and \ + self.version == other.version + + def toTransformationXML(self): + e = Element('uses', [ + ('namespace', self.namespace), + ('name', self.name), + ('version', self.version), + ('executable', self.executable) + ]) + + for m in self._metadata: + e.element(m.toXML()) + + return e + + def toJobXML(self): + e = Element('uses', [ + ('namespace', self.namespace), + ('name', self.name), + ('version', self.version), + ('link', self.link), + ('register', self.register), + ('transfer', self.transfer), + ('optional', self.optional), + ('executable', self.executable), + ('size', self.size) + ]) + + for m in self._metadata: + e.element(m.toXML()) + + return e + + +class UseMixin: + def addUse(self, use): + """Add Use to this object""" + if self.hasUse(use): + raise DuplicateError("Duplicate Use %s" % use) + self.used.add(use) + + def removeUse(self, use): + """Remove use from this object""" + if not self.hasUse(use): + raise NotFoundError("No such Use", use) + self.used.remove(use) + + def hasUse(self, use): + """Test to see if this object has use""" + return use in self.used + + def clearUses(self): + """Remove all uses from this object""" + self.used.clear() + + def uses(self, arg, link=None, register=None, transfer=None, + optional=None, namespace=None, version=None, executable=None, + size=None): + + if isinstance(arg, CatalogType): + _name = arg.name + else: + _name = arg + + _namespace = None + _version = None + _executable = None + + if isinstance(arg, Executable): + _namespace = arg.namespace + _version = arg.version + # We only need to set this for jobs + # the default is True for Transformations + if isinstance(self, AbstractJob): + _executable = True + + if isinstance(arg, File): + # We only need to set this for transformations + # The default is False for Jobs + if isinstance(self, Transformation): + _executable = False + + if namespace is not None: + _namespace = namespace + if version is not None: + _version = str(version) + if executable is not None: + _executable = executable + + use = Use(_name, link, register, transfer, optional, _namespace, + _version, _executable, size) + + # Copy metadata from File or Executable + # XXX Maybe we only want this if link!=input + if isinstance(arg, CatalogType): + for m in arg._metadata: + use.addMetadata(m) + + self.addUse(use) + + +class Transformation(UseMixin, InvokeMixin, MetadataMixin): + """Transformation((name|executable)[,namespace][,version]) + + A logical transformation. This is basically defining one or more + entries in the transformation catalog. You can think of it like a macro + for adding to your jobs. You can define a transformation that + uses several files and/or executables, and refer to it when creating + a job. 
If you do, then all of the uses defined for that transformation + will be copied to the job during planning. + + This code: + in = File("input.txt") + exe = Executable("exe") + t = Transformation(namespace="foo", name="bar", version="baz") + t.uses(in) + t.uses(exe) + j = Job(t) + + is equivalent to: + in = File("input.txt") + exe = Executable("exe") + j = Job(namespace="foo", name="bar", version="baz") + j.uses(in) + j.uses(exe) + + Examples: + Transformation(name='mDiff') + Transformation(namespace='montage',name='mDiff') + Transformation(namespace='montage',name='mDiff',version='3.0') + + Using one executable: + mProjectPP = Executable(namespace="montage",name="mProjectPP",version="3.0") + x_mProjectPP = Transformation(mProjectPP) + + Using several executables: + mDiff = Executable(namespace="montage",name="mProjectPP",version="3.0") + mFitplane = Executable(namespace="montage",name="mFitplane",version="3.0") + mDiffFit = Executable(namespace="montage",name="mDiffFit",version="3.0") + x_mDiffFit = Transformation(mDiffFit) + x_mDiffFit.uses(mDiff) + x_mDiffFit.uses(mFitplane) + + Config files too: + conf = File("jbsim.conf") + jbsim = Executable(namespace="scec",name="jbsim") + x_jbsim = Transformation(jbsim) + x_jbsim.uses(conf) + """ + + def __init__(self, name, namespace=None, version=None): + """ + The name argument can be either a string or an Executable object. + If it is an Executable object, then the Transformation inherits + its name, namespace and version from the Executable, and the + Transformation is set to use the Executable with link=input, + transfer=true, and register=False. + + Arguments: + name: The name of the transformation + namespace: The namespace of the xform (optional) + version: The version of the xform (optional) + """ + self.name = None + self.namespace = None + self.version = None + self.used = set() + self.invocations = set() + self._metadata = set() + if isinstance(name, Executable): + self.name = name.name + self.namespace = name.namespace + self.version = name.version + else: + self.name = name + if namespace: self.namespace = namespace + if version: self.version = version + + def __unicode__(self): + return u"" % (self.namespace, self.name, self.version) + + def __str__(self): + return unicode(self).encode("utf-8") + + def __hash__(self): + return hash((self.namespace, self.name, self.version)) + + def __eq__(self, other): + if isinstance(other, Transformation): + return self.namespace == other.namespace and \ + self.name == other.name and \ + self.version == other.version + + def toXML(self): + """Return an XML representation of this transformation""" + e = Element('transformation', [ + ('namespace', self.namespace), + ('name', self.name), + ('version', self.version) + ]) + + # Metadata + for m in self._metadata: + e.element(m.toXML()) + + # Uses + def getlink(a): + if a.link is not None: + return a.link + # Python 3 - make sure we return a string + return "" + + used = list(self.used) + used.sort(key=getlink) + for u in used: + e.element(u.toTransformationXML()) + + # Invocations + for inv in self.invocations: + e.element(inv.toXML()) + + return e + + +class AbstractJob(ProfileMixin, UseMixin, InvokeMixin, MetadataMixin): + """The base class for Job, DAX, and DAG""" + + def __init__(self, id=None, node_label=None): + self.id = id + self.node_label = node_label + + self.arguments = [] + self.profiles = set() + self.used = set() + self.invocations = set() + self._metadata = set() + + self.stdout = None + self.stderr = None + self.stdin = None + + def 
addArguments(self, *arguments): + """Add one or more arguments to the job (this will add whitespace)""" + for arg in arguments: + if not isinstance(arg, (File, basestring)): + raise FormatError("Invalid argument", arg) + for arg in arguments: + if len(self.arguments) > 0: + self.arguments.append(' ') + self.arguments.append(arg) + + def addRawArguments(self, *arguments): + """Add one or more arguments to the job (whitespace will NOT be added)""" + for arg in arguments: + if not isinstance(arg, (File, basestring)): + raise FormatError("Invalid argument", arg) + self.arguments.extend(arguments) + + def clearArguments(self): + """Remove all arguments from this job""" + self.arguments = [] + + def getArguments(self): + """Get the arguments of this job""" + args = [] + for a in self.arguments: + if isinstance(a, File): + args.append(unicode(a.toArgumentXML())) + else: + args.append(a) + return ''.join(args) + + def setStdout(self, filename): + """Redirect stdout to a file""" + if isinstance(filename, File): + self.stdout = filename + else: + self.stdout = File(filename) + + def clearStdout(self): + """Remove stdout file""" + self.stdout = None + + def setStderr(self, filename): + """Redirect stderr to a file""" + if isinstance(filename, File): + self.stderr = filename + else: + self.stderr = File(filename) + + def clearStderr(self): + """Remove stderr file""" + self.stderr = None + + def setStdin(self, filename): + """Redirect stdin from a file""" + if isinstance(filename, File): + self.stdin = filename + else: + self.stdin = File(filename) + + def clearStdin(self): + """Remove stdin file""" + self.stdin = None + + def innerXML(self, element): + """Return an XML representation of this job""" + # Arguments + if len(self.arguments) > 0: + args = Element('argument').flatten() + for x in self.arguments: + if isinstance(x, File): + args.element(x.toArgumentXML()) + else: + args.text(x) + element.element(args) + + # Metadata + for m in self._metadata: + element.element(m.toXML()) + + # Profiles + for pro in self.profiles: + element.element(pro.toXML()) + + # Stdin/xml/err + if self.stdin is not None: + element.element(self.stdin.toStdioXML('stdin')) + if self.stdout is not None: + element.element(self.stdout.toStdioXML('stdout')) + if self.stderr is not None: + element.element(self.stderr.toStdioXML('stderr')) + + # Uses + def getlink(a): + if a.link is not None: + return a.link + # Python 3 - make sure we return a string + return "" + + used = list(self.used) + used.sort(key=getlink) + for use in used: + element.element(use.toJobXML()) + + # Invocations + for inv in self.invocations: + element.element(inv.toXML()) + + +class Job(AbstractJob): + """Job((name|Executable|Transformation)[,id][,namespace][,version][,node_label]) + + This class defines the specifics of a job to run in an abstract manner. + All filename references still refer to logical files. All references + transformations also refer to logical transformations, though + physical location hints can be passed through profiles. + + Examples: + sleep = Job(id="ID0001",name="sleep") + jbsim = Job(id="ID0002",name="jbsim",namespace="cybershake",version="2.1") + merge = Job("jbsim") + + You can create a Job based on a Transformation: + mDiff_xform = Transformation("mDiff", ...) + mDiff_job = Job(mDiff_xform) + + Or an Executable: + mDiff_exe = Executable("mDiff", ...) + mDiff_job = Job(mDiff_exe) + + Several arguments can be added at the same time: + input = File(...) + output = File(...) 
+ job.addArguments("-i",input,"-o",output) + + Profiles are added similarly: + job.addProfile(Profile(Namespace.ENV, key='PATH', value='/bin')) + job.profile(Namespace.ENV, "PATH", "/bin") + + Adding file uses is simple, and you can override global File attributes: + job.uses(input, Link.INPUT) + job.uses(output, Link.OUTPUT, transfer=True, register=True) + """ + + def __init__(self, name, id=None, namespace=None, version=None, node_label=None): + """The ID for each job should be unique in the DAX. If it is None, then + it will be automatically generated when the job is added to the DAX. + + The name, namespace, and version should match what you have in your + transformation catalog. For example, if namespace="foo" name="bar" + and version="1.0", then the transformation catalog should have an + entry for "foo::bar:1.0". + + The name argument can be either a string, or a Transformation object. If + it is a Transformation object, then the job will inherit the name, namespace, + and version from the Transformation. + + Arguments: + name: The transformation name or Transformation object (required) + id: A unique identifier for the job (optional) + namespace: The namespace of the transformation (optional) + version: The transformation version (optional) + node_label: The label for this job to use in graphing (optional) + """ + self.namespace = None + self.version = None + if isinstance(name, (Transformation, Executable)): + self.name = name.name + self.namespace = name.namespace + self.version = name.version + elif isinstance(name, basestring): + self.name = name + else: + raise FormatError("Name must be a string, Transformation or Executable") + if not self.name: + raise FormatError("Invalid name", self.name) + AbstractJob.__init__(self, id=id, node_label=node_label) + if namespace: self.namespace = namespace + if version: self.version = version + + def __unicode__(self): + return u"" % (self.id, self.namespace, self.name, self.version) + + def __str__(self): + return unicode(self).encode("utf-8") + + def toXML(self): + e = Element('job', [ + ('id', self.id), + ('namespace', self.namespace), + ('name', self.name), + ('version', self.version), + ('node-label', self.node_label) + ]) + self.innerXML(e) + return e + + +class DAX(AbstractJob): + """DAX(file[,id][,node_label]) + + This job represents a sub-DAX that will be planned and executed by + the workflow. + + Examples: + daxjob1 = DAX("foo.dax") + + daxfile = File("foo.dax") + daxjob2 = DAX(daxfile) + """ + + def __init__(self, file, id=None, node_label=None): + """ + + The name argument can be either a string, or a File object. If + it is a File object, then this job will inherit its name from the + File and the File will be added in a with transfer=True, + register=False, and link=input. 
+ + Arguments: + file: The logical name of the DAX file or the DAX File object + id: The id of the DAX job [default: autogenerated] + node_label: The label for this job to use in graphing + """ + if isinstance(file, File): + self.file = file + elif isinstance(file, str) or isinstance(file, unicode): + self.file = File(name=file) + else: + raise FormatError("invalid file", file) + AbstractJob.__init__(self, id=id, node_label=node_label) + + def __unicode__(self): + return u"" % (self.id, self.file.name) + + def __str__(self): + return unicode(self).encode("utf-8") + + def toXML(self): + """Return an XML representation of this job""" + e = Element('dax', [ + ('id', self.id), + ('file', self.file.name), + ('node-label', self.node_label) + ]) + self.innerXML(e) + return e + + +class DAG(AbstractJob): + """DAG(file[,id][,node_label]) + + This job represents a sub-DAG that will be executed by this + workflow. + + Examples: + dagjob1 = DAG(file="foo.dag") + + dagfile = File("foo.dag") + dagjob2 = DAG(dagfile) + """ + + def __init__(self, file, id=None, node_label=None): + """ + The name argument can be either a string, or a File object. If + it is a File object, then this job will inherit its name from the + File and the File will be added in a with transfer=True, + register=False, and link=input. + + Arguments: + file: The logical name of the DAG file, or the DAG File object + id: The ID of the DAG job [default: autogenerated] + node_label: The label for this job to use in graphing + """ + if isinstance(file, File): + self.file = file + elif isinstance(file, str) or isinstance(file, unicode): + self.file = File(name=file) + else: + raise FormatError("Invalid file", file) + AbstractJob.__init__(self, id=id, node_label=node_label) + + def __unicode__(self): + return u"" % (self.id, self.file.name) + + def __str__(self): + return unicode(self).encode("utf-8") + + def toXML(self): + """Return an XML representation of this DAG""" + e = Element('dag', [ + ('id', self.id), + ('file', self.file.name), + ('node-label', self.node_label) + ]) + self.innerXML(e) + return e + + +class Dependency: + """A dependency between two nodes in the ADAG""" + + def __init__(self, parent, child, edge_label=None): + if isinstance(parent, AbstractJob): + if not parent.id: + raise FormatError("Parent job has no id", parent) + self.parent = parent.id + elif parent: + self.parent = parent + else: + raise FormatError("Invalid parent", parent) + if isinstance(child, AbstractJob): + if not child.id: + raise FormatError("Child job has no id", child) + self.child = child.id + elif child: + self.child = child + else: + raise FormatError("Invalid child", child) + if self.parent == self.child: + raise FormatError("No self edges allowed", (self.parent, self.child)) + self.edge_label = edge_label + + def __unicode__(self): + return " %s>" % (self.parent, self.child) + + def __str__(self): + return unicode(self).encode("utf-8") + + def __hash__(self): + return hash((self.parent, self.child)) + + def __eq__(self, other): + """Equal dependencies have the same parent and child""" + if isinstance(other, Dependency): + return self.parent == other.parent and self.child == other.child + return False + + +class ADAG(InvokeMixin, MetadataMixin): + """ADAG(name[,count][,index]) + + Representation of a directed acyclic graph in XML (DAX). + + Examples: + dax = ADAG('diamond') + or, if you want to use the old style count/index partitioning stuff: + part5 = ADAG('partition_5',count=10,index=5) + + Adding jobs: + a = Job(...) 
+ dax.addJob(a) + + Adding parent-child control-flow dependency: + dax.addDependency(Dependency(parent=a,child=b)) + dax.addDependency(Dependency(parent=a,child=c)) + dax.addDependency(Dependency(parent=b,child=d)) + dax.addDependency(Dependency(parent=c,child=d)) + or: + dax.depends(child=b, parent=a) + + Adding Files (not required if you have a replica catalog): + input = File(...) + dax.addFile(input) + + Adding Executables (not required if you have a transformation catalog): + exe = Executable(...) + dax.addExecutable(exe) + + Adding Transformations (not required if you have a transformation catalog): + xform = Transformation(...) + dax.addTransformation(xform) + + Writing a DAX out to a file: + f = open('diamond.dax','w') + dax.writeXML(f) + f.close() + """ + + def __init__(self, name, count=None, index=None, auto=False): + """ + Arguments: + name: The name of the workflow + count: Total number of DAXes that will be created + index: Zero-based index of this DAX + """ + if not name: + raise FormatError("Invalid ADAG name", name) + self.name = name + if count: count = int(count) + if index: index = int(index) + self.count = count + self.index = index + self._auto = auto if auto is True else False + + # This is used to generate unique ID numbers + self.sequence = 1 + + self.jobs = {} + self.files = set() + self.executables = set() + self.dependencies = set() + self.transformations = set() + self.invocations = set() + self._metadata = set() + + # PM-1311 always associate dax.api metadata + self.metadata("dax.api", "python") + + def __unicode__(self): + return u"" % self.name + + def __str__(self): + return unicode(self).encode("utf-8") + + def nextJobID(self): + """Get an autogenerated ID for the next job""" + next = None + while not next or next in self.jobs: + next = "ID%07d" % self.sequence + self.sequence += 1 + return next + + def getJob(self, jobid): + """Get a Job/DAG/DAX""" + if not jobid in self.jobs: + raise NotFoundError("Job not found", jobid) + return self.jobs[jobid] + + def addJob(self, job): + """Add a job to this ADAG""" + # Add an auto-generated ID if the job doesn't have one + if job.id is None: + job.id = self.nextJobID() + if self.hasJob(job): + raise DuplicateError("Duplicate job %s" % job) + self.jobs[job.id] = job + + def hasJob(self, job): + """Test to see if job is in this ADAG + The job parameter can be an object or a job ID + """ + if isinstance(job, AbstractJob): + return job.id in self.jobs + else: + return job in self.jobs + + def removeJob(self, job): + """Remove job from this ADAG""" + if not self.hasJob(job): + raise NotFoundError("Job not found", job) + if isinstance(job, AbstractJob): + del self.jobs[job.id] + else: + del self.jobs[job] + + def clearJobs(self): + """Remove all jobs""" + self.jobs = {} + + def addDAX(self, dax): + """Add a sub-DAX (synonym for addJob)""" + if not isinstance(dax, DAX): + raise FormatError("Not a DAX", dax) + self.addJob(dax) + + def addDAG(self, dag): + """Add a sub-DAG (synonym for addJob)""" + if not isinstance(dag, DAG): + raise FormatError("Not a DAG", dag) + self.addJob(dag) + + def addFile(self, file): + """Add a file to the DAX""" + if not isinstance(file, File): + raise FormatError("Invalid File", file) + if self.hasFile(file): + raise DuplicateError("Duplicate file %s" % file) + self.files.add(file) + + def hasFile(self, file): + """Check to see if file is in this ADAG""" + return file in self.files + + def removeFile(self, file): + """Remove file from this ADAG""" + if not self.hasFile(file): + raise 
NotFoundError("File not found", file) + self.files.remove(file) + + def clearFiles(self): + """Remove all files""" + self.files.clear() + + def addExecutable(self, executable): + """Add an executable to this ADAG""" + if self.hasExecutable(executable): + raise DuplicateError("Duplicate executable %s" % executable) + self.executables.add(executable) + + def hasExecutable(self, executable): + """Check if executable is in this ADAG""" + return executable in self.executables + + def removeExecutable(self, executable): + """Remove executable from this ADAG""" + if not self.hasExecutable(executable): + raise NotFoundError("Executable not found %s" % executable) + self.executables.remove(executable) + + def clearExecutables(self): + """Remove all executables""" + self.executables.clear() + + def addTransformation(self, transformation): + """Add a transformation to this ADAG""" + if self.hasTransformation(transformation): + raise DuplicateError("Duplicate tranformation %s" % transformation) + self.transformations.add(transformation) + + def hasTransformation(self, transformation): + """Check to see if transformation is in this ADAG""" + return transformation in self.transformations + + def removeTransformation(self, transformation): + """Remove transformation from this ADAG""" + if not self.hasTransformation(transformation): + raise NotFoundError("Transformation not found %s" % transformation) + self.transformations.remove(transformation) + + def clearTransformations(self): + """Remove all transformations""" + self.transformations.clear() + + def depends(self, child, parent, edge_label=None): + """Add a dependency to the workflow + Arguments: + child: The child job/dax/dag or id + parent: The parent job/dax/dag or id + edge_label: A label for the edge (optional) + """ + d = Dependency(parent, child, edge_label) + self.addDependency(d) + + def addDependency(self, dep): + """Add a dependency to the workflow + + The old way to call this method is no longer valid. Please change: + adag.addDependency(parent="ID01", child="ID02", edge_label="E01") + to be: + adag.addDependency(Dependency(parent="ID01", child="ID02", edge_label="E01")) + or: + adag.depends(parent="ID01", child="ID02", edge_label="E01") + + """ + if self.hasDependency(dep): + raise DuplicateError("Duplicate dependency %s" % dep) + # Check the jobs + if dep.parent not in self.jobs: + raise NotFoundError("Parent not found", dep.parent) + if dep.child not in self.jobs: + raise NotFoundError("Child not found", dep.child) + self.dependencies.add(dep) + + def hasDependency(self, dep): + """Check to see if dependency exists""" + return dep in self.dependencies + + def removeDependency(self, dep): + """Remove dependency from workflow""" + if not self.hasDependency(dep): + raise NotFoundError("Dependency not found", dep) + self.dependencies.remove(dep) + + def clearDependencies(self): + """Remove all dependencies""" + self.dependencies.clear() + + def toXML(self): + """Get the XML string for this ADAG + This is primarily intended for testing. If you have a large ADAG + you should use writeXML instead. 
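+
+        Illustrative usage (assuming 'dax' is an existing ADAG instance):
+
+            xml_string = dax.toXML()          # whole DAX as an XML string, for tests
+            dax.writeXMLFile("diamond.dax")   # preferred for large DAXes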
+ """ + s = StringIO() + self.writeXML(s) + xml = s.getvalue() + s.close() + return xml + + def writeXMLFile(self, filename): + """Write the ADAG to an XML file""" + file = codecs.open(filename, "w", "utf-8") + self.writeXML(file) + file.close() + + def _autoDependencies(self): + """Automatically compute job dependencies based on input/output files used by a job""" + if self._auto is False: + return + + mapping = {} + + def addOutput(job, file_obj): + if file_obj: + file_obj = file_obj.name + + if file_obj not in mapping: + mapping[file_obj] = (set(), set()) + + mapping[file_obj][1].add(job) + + # Automatically determine dependencies + + # Traverse each job + for job_id, job in self.jobs.items(): + file_used = job.used + + # If job produces to stdout, identify it as an output file + addOutput(job, job.stdout) + # If job produces to stderr, identify it as an output file + addOutput(job, job.stderr) + + # If job consumes from stdin, identify it as an input file + if job.stdin: + if job.stdin.name not in mapping: + mapping[job.stdin.name] = (set(), set()) + + mapping[job.stdin.name][0].add(job) + + for f in file_used: + if f.name not in mapping: + mapping[f.name] = (set(), set()) + + if f.link == Link.INPUT: + mapping[f.name][0].add(job) + else: + mapping[f.name][1].add(job) + + for file_name, io in mapping.items(): + # Go through the mapping and for each file add dependencies between the + # job producing a file and the jobs consuming the file + inputs = io[0] + + if len(io[1]) > 0: + output = io[1].pop() + + for _input in inputs: + try: + self.depends(parent=output, child=_input) + except DuplicateError: + pass + + def writeXML(self, out): + """Write the ADAG as XML to a stream""" + self._autoDependencies() + + # Preamble + out.write('\n') + + out.write('\n' % datetime.datetime.now()) + if os.name == 'posix': + import pwd + username = pwd.getpwuid(os.getuid())[0] + elif os.name == 'nt': + username = os.getenv("USERNAME", "N/A") + else: + username = "N/A" + out.write('\n' % username) + out.write('\n') + + # Open tag + out.write('\n') + + # Metadata + for m in self._metadata: + out.write('\t') + m.toXML().write(stream=out, level=1) + out.write('\n') + + # Invocations + for i in self.invocations: + out.write('\t') + i.toXML().write(stream=out, level=1) + out.write('\n') + + # Files + for f in self.files: + out.write('\t') + f.toXML().write(stream=out, level=1) + out.write('\n') + + # Executables + for e in self.executables: + out.write('\t') + e.toXML().write(stream=out, level=1) + out.write('\n') + + # Transformations + for t in self.transformations: + out.write('\t') + t.toXML().write(stream=out, level=1) + out.write('\n') + + # Jobs + keys = self.jobs.keys() + keys = sorted(keys) + for job_id in keys: + job = self.jobs[job_id] + out.write('\t') + job.toXML().write(stream=out, level=1) + out.write('\n') + + # Dependencies + # Since we store dependencies as tuples, but we need to print them as nested elements + # we first build a map of all the children that maps child -> [(parent,label),...] 
+ children = {} + for dep in self.dependencies: + if not dep.child in children: + children[dep.child] = [] + children[dep.child].append((dep.parent, dep.edge_label)) + + # Now output all the xml in sorted order by child, then parent + keys = children.keys() + keys = sorted(keys) + for child in keys: + out.write('\t') + c = Element("child", [("ref", child)]) + parents = children[child] + parents = sorted(parents) + for parent, edge_label in parents: + p = Element("parent", [ + ("ref", parent), + ("edge-label", edge_label) + ]) + c.element(p) + c.write(stream=out, level=1) + out.write('\n') + + # Close tag + out.write('\n') + + +def parseString(string): + s = StringIO(string) + return parse(s) + + +def parse(infile): + try: + import xml.etree.cElementTree as etree + except: + try: + import xml.etree.ElementTree as etree + except: + try: + import elementtree.ElementTree as etree + except: + raise Exception("Please install elementtree") + + NS = "{http://pegasus.isi.edu/schema/DAX}" + + def QN(tag): + return NS + tag + + def badattr(e, exc): + return ParseError("Attribute '%s' is required for element %s" % (exc.args[0], e.tag)) + + def parse_invoke(e): + try: + return Invoke(when=e.attrib["when"], what=e.text) + except KeyError as ke: + raise badattr(e, ke) + + def parse_adag(e): + try: + name = e.attrib['name'] + count = e.get("count", None) + index = e.get("index", None) + return ADAG(name=name, count=count, index=index) + except KeyError as ke: + raise badattr(e, ke) + + def parse_profile(e): + try: + return Profile( + namespace=e.attrib["namespace"], + key=e.attrib["key"], + value=e.text) + except KeyError as ke: + raise badattr(e, ke) + + def parse_metadata(e): + try: + return Metadata( + key=e.attrib['key'], + value=e.text) + except KeyError as ke: + raise badattr(e, ke) + + def parse_pfn(e): + try: + p = PFN( + url=e.attrib['url'], + site=e.get("site", None) + ) + except KeyError as ke: + raise badattr(e, ke) + for pr in e.findall(QN("profile")): + p.addProfile(parse_profile(pr)) + return p + + def parse_catalog(e, f): + for p in e.findall(QN("profile")): + f.addProfile(parse_profile(p)) + for m in e.findall(QN("metadata")): + f.addMetadata(parse_metadata(m)) + for p in e.findall(QN("pfn")): + f.addPFN(parse_pfn(p)) + return f + + def parse_file(e): + try: + f = File(e.attrib['name']) + except KeyError as ke: + raise badattr(e, ke) + return parse_catalog(e, f) + + def parse_executable(e): + try: + exe = Executable( + name=e.attrib['name'], + namespace=e.get("namespace", None), + version=e.get("version", None), + arch=e.get("arch", None), + os=e.get("os", None), + osrelease=e.get("osrelease", None), + osversion=e.get("osversion", None), + glibc=e.get("glibc", None), + installed=e.get("installed", None) + ) + except KeyError as ke: + raise badattr(e, ke) + parse_catalog(e, exe) + for i in e.findall(QN("invoke")): + exe.addInvoke(parse_invoke(i)) + return exe + + def parse_uses(e): + try: + u = Use( + e.attrib['name'], + namespace=e.get('namespace', None), + version=e.get('version', None), + link=e.get('link', None), + register=e.get('register', None), + transfer=e.get('transfer', None), + optional=e.get('optional', None), + executable=e.get('executable', None) + ) + except KeyError as ke: + raise badattr(e, ke) + for m in e.findall(QN("metadata")): + u.addMetadata(parse_metadata(m)) + return u + + def parse_transformation(e): + try: + t = Transformation( + namespace=e.get("namespace", None), + name=e.attrib['name'], + version=e.get("version", None)) + except KeyError as ke: + raise 
badattr(e, ke) + for u in e.findall(QN("uses")): + t.addUse(parse_uses(u)) + for i in e.findall(QN("invoke")): + t.addInvoke(parse_invoke(i)) + for m in e.findall(QN("metadata")): + t.addMetadata(parse_metadata(m)) + return t + + def iterelem(e): + if e.text: + yield e.text + for f in e: + if f.text: + yield f.text + yield f + if f.tail: + yield f.tail + + def parse_absjob(e, j): + args = e.find(QN("argument")) + if args is not None: + for i in iterelem(args): + if isinstance(i, basestring): + j.addRawArguments(i) + else: + j.addRawArguments(File(i.attrib['name'])) + + try: + s = e.find(QN("stdin")) + if s is not None: + j.setStdin(s.attrib['name']) + + s = e.find(QN("stdout")) + if s is not None: + j.setStdout(s.attrib['name']) + + s = e.find(QN("stderr")) + if s is not None: + j.setStderr(s.attrib['name']) + except KeyError as ke: + raise badattr(s, ke) + + for p in e.findall(QN("profile")): + j.addProfile(parse_profile(p)) + + for u in e.findall(QN("uses")): + j.addUse(parse_uses(u)) + + for i in e.findall(QN("invoke")): + j.addInvoke(parse_invoke(i)) + + for m in e.findall(QN("metadata")): + j.addMetadata(parse_metadata(m)) + + return j + + def parse_job(e): + try: + j = Job( + name=e.attrib["name"], + id=e.attrib["id"], + namespace=e.get("namespace", None), + version=e.get("version", None), + node_label=e.get("node-label", None) + ) + except KeyError as ke: + raise badattr(e, ke) + return parse_absjob(e, j) + + def parse_dax(e): + try: + d = DAX( + file=e.attrib["file"], + id=e.attrib["id"], + node_label=e.get("node-label", None) + ) + except KeyError as ke: + raise badattr(e, ke) + return parse_absjob(e, d) + + def parse_dag(e): + try: + d = DAG( + file=e.attrib["file"], + id=e.attrib["id"], + node_label=e.get("node-label", None) + ) + except KeyError as ke: + raise badattr(e, ke) + return parse_absjob(e, d) + + def parse_dependencies(e): + try: + child = e.attrib["ref"] + except KeyError as ke: + raise badattr(e, ke) + for p in e.findall(QN("parent")): + try: + parent = p.attrib["ref"] + label = p.attrib.get("edge-label", None) + yield Dependency(parent, child, label) + except KeyError as ke: + raise badattr(p, ke) + + # We use iterparse because we don't have to read in the + # entire document + iterator = etree.iterparse(infile, events=("start", "end")) + iterator = iter(iterator) + + # Get the document element (should be ) + event, root = next(iterator) + adag = parse_adag(root) + + # This function reads all the children of "node" + def expand(node): + event, elem = next(iterator) + while elem != node: + event, elem = next(iterator) + + # We clear the document element to prevent + # the memory usage from growing + root.clear() + + for ev, elem in iterator: + if ev == "end": + continue + + # Read in the entire element and children + expand(elem) + + if elem.tag == QN("job"): + j = parse_job(elem) + adag.addJob(j) + elif elem.tag == QN("child"): + for d in parse_dependencies(elem): + adag.addDependency(d) + elif elem.tag == QN("file"): + f = parse_file(elem) + adag.addFile(f) + elif elem.tag == QN("executable"): + e = parse_executable(elem) + adag.addExecutable(e) + elif elem.tag == QN("transformation"): + t = parse_transformation(elem) + adag.addTransformation(t) + elif elem.tag == QN("dag"): + d = parse_dag(elem) + adag.addJob(d) + elif elem.tag == QN("dax"): + d = parse_dax(elem) + adag.addJob(d) + elif elem.tag == QN("invoke"): + adag.addInvoke(parse_invoke(elem)) + elif elem.tag == QN("metadata"): + adag.addMetadata(parse_metadata(elem)) + else: + raise ParseError("Unknown 
tag", elem.tag) + + return adag + + +def main(): + """Simple smoke test""" + # Create a DAX + diamond = ADAG("diamond") + + # Add some metadata + diamond.metadata("name", "diamond") + diamond.metadata("createdby", "Gideon Juve") + + # add some invoke condition + diamond.invoke('on_error', '/usr/bin/update_db -failure') + + # Add input file to the DAX-level replica catalog + a = File("f.a") + a.addPFN(PFN("gsiftp://site.com/inputs/f.a", "site")) + a.metadata("size", "1024") + diamond.addFile(a) + + # Add executables to the DAX-level replica catalog + e_preprocess = Executable(namespace="diamond", name="preprocess", version="4.0", os="linux", arch="x86_64") + e_preprocess.metadata("size", "2048") + e_preprocess.addPFN(PFN("gsiftp://site.com/bin/preprocess", "site")) + diamond.addExecutable(e_preprocess) + + e_findrange = Executable(namespace="diamond", name="findrange", version="4.0", os="linux", arch="x86_64") + e_findrange.addPFN(PFN("gsiftp://site.com/bin/findrange", "site")) + diamond.addExecutable(e_findrange) + + e_analyze = Executable(namespace="diamond", name="analyze", version="4.0", os="linux", arch="x86_64") + e_analyze.addPFN(PFN("gsiftp://site.com/bin/analyze", "site")) + e_analyze.addProfile(Profile(namespace="env", key="APP_HOME", value="/app")) + diamond.addExecutable(e_analyze) + + # Add a preprocess job + preprocess = Job(e_preprocess) + preprocess.metadata("time", "60") + b1 = File("f.b1") + b2 = File("f.b2") + preprocess.addArguments("-a preprocess", "-T60", "-i", a, "-o", b1, b2) + preprocess.uses(a, link=Link.INPUT) + preprocess.uses(b1, link=Link.OUTPUT, transfer=True) + preprocess.uses(b2, link=Link.OUTPUT, transfer=True) + diamond.addJob(preprocess) + + # Add left Findrange job + frl = Job(e_findrange) + frl.metadata("time", "60") + c1 = File("f.c1") + frl.addArguments("-a findrange", "-T60", "-i", b1, "-o", c1) + frl.uses(b1, link=Link.INPUT) + frl.uses(c1, link=Link.OUTPUT, transfer=True) + diamond.addJob(frl) + + # Add right Findrange job + frr = Job(e_findrange) + frr.metadata("time", "60") + c2 = File("f.c2") + frr.addArguments("-a findrange", "-T60", "-i", b2, "-o", c2) + frr.uses(b2, link=Link.INPUT) + frr.uses(c2, link=Link.OUTPUT, transfer=True) + diamond.addJob(frr) + + # Add Analyze job + analyze = Job(e_analyze) + analyze.metadata("time", "60") + d = File("f.d") + analyze.addArguments("-a analyze", "-T60", "-i", c1, c2, "-o", d) + analyze.uses(c1, link=Link.INPUT) + analyze.uses(c2, link=Link.INPUT) + analyze.uses(d, link=Link.OUTPUT, transfer=True, register=True) + diamond.addJob(analyze) + + # Add dependencies + diamond.depends(parent=preprocess, child=frl) + diamond.depends(parent=preprocess, child=frr) + diamond.depends(parent=frl, child=analyze) + diamond.depends(parent=frr, child=analyze) + + # Get generated diamond dax + import sys + diamond.writeXML(sys.stdout) + + +if __name__ == '__main__': + main() diff --git a/workflow-generator/Pegasus/__init__.py b/workflow-generator/Pegasus/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/README.md b/workflow-generator/README.md new file mode 100644 index 0000000..7bc68c0 --- /dev/null +++ b/workflow-generator/README.md @@ -0,0 +1 @@ +docker run -v $PWD:/workdir hyperflowwms/soykb-generator sh -c 'generate-workflow 2' diff --git a/workflow-generator/chromosomes.txt b/workflow-generator/chromosomes.txt new file mode 100644 index 0000000..76c659e --- /dev/null +++ b/workflow-generator/chromosomes.txt @@ -0,0 +1,20 @@ +>Chr01 +>Chr02 +>Chr03 +>Chr04 +>Chr05 +>Chr06 
+>Chr07 +>Chr08 +>Chr09 +>Chr10 +>Chr11 +>Chr12 +>Chr13 +>Chr14 +>Chr15 +>Chr16 +>Chr17 +>Chr18 +>Chr19 +>Chr20 diff --git a/workflow-generator/conf/.soybean-workflow.conf b/workflow-generator/conf/.soybean-workflow.conf new file mode 100644 index 0000000..bbd4106 --- /dev/null +++ b/workflow-generator/conf/.soybean-workflow.conf @@ -0,0 +1,26 @@ +# local refers to the submit host. Specify paths to a directory +# which can be used by the workflow as work space, and locations +# for local software installs. +[local] + +work_dir = data + +irods_bin = irods_bin + +# tacc refers to configuration for the TACC Stampede +# supercomputer. To use this machine, you need an allocation +# (start with TG-) and you also need to know your username +# and storage group name for the system. The easiest way to +# obtain those is to log into the system, and run: +# cds; pwd +# This should return a path like: /scratch/00384/rynge. The +# storage group is the second level, and your username is +# last level. +[tacc] + +allocation = TG-ABC1234 + +username = rynge + +storage_group = 00384 + diff --git a/workflow-generator/conf/distributed/pegasus.conf b/workflow-generator/conf/distributed/pegasus.conf new file mode 100644 index 0000000..3ee6a94 --- /dev/null +++ b/workflow-generator/conf/distributed/pegasus.conf @@ -0,0 +1,22 @@ +pegasus.metrics.app = Soykb + +pegasus.catalog.site.file = sites.catalog + +pegasus.catalog.transformation.file = transformations.catalog + +pegasus.catalog.replica = File +pegasus.catalog.replica.file = replica.catalog + +pegasus.dir.useTimestamp = true +pegasus.dir.storage.mapper = Flat +pegasus.dir.storage.deep = true +pegasus.condor.logs.symlink = false + +pegasus.data.configuration = nonsharedfs + +pegasus.transfer.threads = 4 +pegasus.transfer.lite.threads = 4 +pegasus.stagein.clusters = 2 +pegasus.stageout.clusters = 2 + + diff --git a/workflow-generator/conf/distributed/replica.catalog b/workflow-generator/conf/distributed/replica.catalog new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/conf/distributed/site.conf b/workflow-generator/conf/distributed/site.conf new file mode 100644 index 0000000..84d2139 --- /dev/null +++ b/workflow-generator/conf/distributed/site.conf @@ -0,0 +1,11 @@ + + +[exec_environment] + +staging_site = isi_workflow + +output_site = isi_workflow + +job_clustering = + + diff --git a/workflow-generator/conf/distributed/sites.catalog.template b/workflow-generator/conf/distributed/sites.catalog.template new file mode 100644 index 0000000..fd80d75 --- /dev/null +++ b/workflow-generator/conf/distributed/sites.catalog.template @@ -0,0 +1,39 @@ + + + + + + + + + + $pegasus_bin:$irods_bin:/usr/bin:/bin + $home/.ssh/workflow + + + condor + vanilla + isUndefined(GLIDEIN_Entry_Name) + "SoyKB" + $irods_bin:/usr/bin:/bin + /tmp + + + + + + $home/irods.iplant.env + + + + + + + + + + + $home/.ssh/workflow + + + diff --git a/workflow-generator/conf/distributed/transformations.catalog b/workflow-generator/conf/distributed/transformations.catalog new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/conf/main.conf b/workflow-generator/conf/main.conf new file mode 100644 index 0000000..d32b9cf --- /dev/null +++ b/workflow-generator/conf/main.conf @@ -0,0 +1,11 @@ + +[main] + +# single-end or pair-end +inputs-style = pair-end + +# example: QD < 2.0 || FS > 200.0 || MQ < 40 || Haplotypescore > 20.0 +snp_filter = QD < 2.0 || FS > 60.0 || MQ < 40.0 +indel_filter = QD < 2.0 || FS > 200.0 || MQ < 40 + + diff --git 
a/workflow-generator/conf/missouri/pegasus.conf b/workflow-generator/conf/missouri/pegasus.conf new file mode 100644 index 0000000..3ee6a94 --- /dev/null +++ b/workflow-generator/conf/missouri/pegasus.conf @@ -0,0 +1,22 @@ +pegasus.metrics.app = Soykb + +pegasus.catalog.site.file = sites.catalog + +pegasus.catalog.transformation.file = transformations.catalog + +pegasus.catalog.replica = File +pegasus.catalog.replica.file = replica.catalog + +pegasus.dir.useTimestamp = true +pegasus.dir.storage.mapper = Flat +pegasus.dir.storage.deep = true +pegasus.condor.logs.symlink = false + +pegasus.data.configuration = nonsharedfs + +pegasus.transfer.threads = 4 +pegasus.transfer.lite.threads = 4 +pegasus.stagein.clusters = 2 +pegasus.stageout.clusters = 2 + + diff --git a/workflow-generator/conf/missouri/replica.catalog b/workflow-generator/conf/missouri/replica.catalog new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/conf/missouri/site.conf b/workflow-generator/conf/missouri/site.conf new file mode 100644 index 0000000..f88a486 --- /dev/null +++ b/workflow-generator/conf/missouri/site.conf @@ -0,0 +1,11 @@ + + +[exec_environment] + +staging_site = staging + +output_site = local + +job_clustering = + + diff --git a/workflow-generator/conf/missouri/sites.catalog.template b/workflow-generator/conf/missouri/sites.catalog.template new file mode 100644 index 0000000..359e79b --- /dev/null +++ b/workflow-generator/conf/missouri/sites.catalog.template @@ -0,0 +1,31 @@ + + + + + + + + + + $pegasus_bin:$irods_bin:/usr/bin:/bin + $home/.ssh/workflow + + + condor + vanilla + /opt/java/jdk1.7.0_09/bin:/usr/bin:/bin + + + + + + $home/irods.iplant.json + + + + + + $home/.ssh/workflow + + + diff --git a/workflow-generator/conf/missouri/transformations.catalog b/workflow-generator/conf/missouri/transformations.catalog new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/workflow-generator/conf/missouri/transformations.catalog @@ -0,0 +1 @@ + diff --git a/workflow-generator/conf/psc-bridges/README.md b/workflow-generator/conf/psc-bridges/README.md new file mode 100644 index 0000000..84ea990 --- /dev/null +++ b/workflow-generator/conf/psc-bridges/README.md @@ -0,0 +1,43 @@ + + +## Editing sites.catalog.template + +Please note that `sites.catalog.template` needs to be updated based on the user who will run the glideins on PSC Bridges. The section which needs to be updated is: + + + + + + +Determine the shared directory assigned to you on Bridges by logging in and running `echo $SCRATCH`. Update the two paths in the section above such that they have your scratch directory, plus `/workflow-runs`. Do _not_ use environment variables here - only full expanded paths are allowed. + + +## Glideins on PSC Bridges + +This setup is based on a PyGlidein setup (https://pegasus.isi.edu/documentation/pyglidein.php) + +To get setup up, first log in to your PSC Bridges account, and then copy the configuration to your home directory: + + $ cd ~ + $ cp ~rynge/rnaseq ~/ + +Edit `~/rnaseq/config/rnaseq-bridges.config`. And the minimum, change `user` and the location of `tarball`, replacing `rynge` with your username. Also update the `#SBATCH --account=` line with a project you want the glidein to charge to. + +Set up the Python virtual environment, and try submitting your first glidein (assuming you already have a workflow submitted on workflow.isi.edu - pyglidein will check for demand before submitting new glideins): + + $ module load python2 + $ cd ~/rnaseq/ + $ . 
venv/bin/activate + $ pyglidein_client --config=$HOME/rnaseq/config/rnaseq-bridges.config --secrets=$HOME/rnaseq/config/secrets + +The output should state that a glidein was submitted: + + 2018-08-29 17:29:55,788 DEBUG {u'count': 1, u'cpus': 1, u'memory': 0, u'gpus': 0, u'disk': 0.001, u'os': None} + 2018-08-29 17:29:55,788 DEBUG {u'jsonrpc': u'2.0', u'result': [{u'count': 1, u'cpus': 1, u'memory': 0, u'gpus': 0, u'disk': 0.001, u'os': None}], u'id': 0} + Submitted batch job 3865303 + 2018-08-29 17:29:55,846 INFO launched 1 glideins on RM + +After a few minutes, you should be able to see the glidein by running `condor_status` on `workflow.isi.edu`. + +PSC Bridges no longer allows cron jobs, so you have to use the pyglidein_client to start glideins manually. + diff --git a/workflow-generator/conf/psc-bridges/pegasus.conf b/workflow-generator/conf/psc-bridges/pegasus.conf new file mode 100644 index 0000000..4a97249 --- /dev/null +++ b/workflow-generator/conf/psc-bridges/pegasus.conf @@ -0,0 +1,19 @@ +pegasus.metrics.app = Soykb + +pegasus.catalog.site.file = sites.catalog + +pegasus.catalog.transformation.file = transformations.catalog + +pegasus.catalog.replica = File +pegasus.catalog.replica.file = replica.catalog + +pegasus.dir.useTimestamp = true +pegasus.dir.storage.mapper = Flat +pegasus.dir.storage.deep = true +pegasus.condor.logs.symlink = false + +pegasus.data.configuration = sharedfs + +pegasus.transfer.*.remote.sites = execution + + diff --git a/workflow-generator/conf/psc-bridges/replica.catalog b/workflow-generator/conf/psc-bridges/replica.catalog new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/conf/psc-bridges/site.conf b/workflow-generator/conf/psc-bridges/site.conf new file mode 100644 index 0000000..a5c3b2e --- /dev/null +++ b/workflow-generator/conf/psc-bridges/site.conf @@ -0,0 +1,11 @@ + + +[exec_environment] + +staging_site = + +output_site = isi_workflow + +job_clustering = + + diff --git a/workflow-generator/conf/psc-bridges/sites.catalog.template b/workflow-generator/conf/psc-bridges/sites.catalog.template new file mode 100644 index 0000000..1ab4ebf --- /dev/null +++ b/workflow-generator/conf/psc-bridges/sites.catalog.template @@ -0,0 +1,48 @@ + + + + + + + + + + + $pegasus_bin:$irods_bin:/usr/bin:/bin + $home/.ssh/workflow + + + + + + + condor + vanilla + regexp("psc.edu", TARGET.FileSystemDomain) + TimeToLive + True + /home/rynge/software/pegasus/pegasus-4.8.3 + + + + + + + + $home/irods.iplant.json + + + + + + + + + + + + $home/.ssh/workflow + + + + diff --git a/workflow-generator/conf/psc-bridges/transformations.catalog b/workflow-generator/conf/psc-bridges/transformations.catalog new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/conf/rc4/pegasus.conf b/workflow-generator/conf/rc4/pegasus.conf new file mode 100644 index 0000000..3ee6a94 --- /dev/null +++ b/workflow-generator/conf/rc4/pegasus.conf @@ -0,0 +1,22 @@ +pegasus.metrics.app = Soykb + +pegasus.catalog.site.file = sites.catalog + +pegasus.catalog.transformation.file = transformations.catalog + +pegasus.catalog.replica = File +pegasus.catalog.replica.file = replica.catalog + +pegasus.dir.useTimestamp = true +pegasus.dir.storage.mapper = Flat +pegasus.dir.storage.deep = true +pegasus.condor.logs.symlink = false + +pegasus.data.configuration = nonsharedfs + +pegasus.transfer.threads = 4 +pegasus.transfer.lite.threads = 4 +pegasus.stagein.clusters = 2 +pegasus.stageout.clusters = 2 + + diff --git a/workflow-generator/conf/rc4/replica.catalog 
b/workflow-generator/conf/rc4/replica.catalog new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/conf/rc4/site.conf b/workflow-generator/conf/rc4/site.conf new file mode 100644 index 0000000..3d5e3de --- /dev/null +++ b/workflow-generator/conf/rc4/site.conf @@ -0,0 +1,11 @@ + + +[exec_environment] + +staging_site = execution + +output_site = local + +job_clustering = + + diff --git a/workflow-generator/conf/rc4/sites.catalog.template b/workflow-generator/conf/rc4/sites.catalog.template new file mode 100644 index 0000000..3069600 --- /dev/null +++ b/workflow-generator/conf/rc4/sites.catalog.template @@ -0,0 +1,32 @@ + + + + + + + + + + $pegasus_bin:$irods_bin:/usr/bin:/bin + $home/.ssh/workflow + + + + + + glite + true + batch slurm + /local/scratch/$username + /local/scratch/$username + /home/rynge/software/jdk1.7.0_09/bin:/usr/bin:/bin + /home/rynge/software/pegasus-4.6.2dev + + + + + + $home/irods.iplant.json + + + diff --git a/workflow-generator/conf/rc4/transformations.catalog b/workflow-generator/conf/rc4/transformations.catalog new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/workflow-generator/conf/rc4/transformations.catalog @@ -0,0 +1 @@ + diff --git a/workflow-generator/conf/tacc-stampede/pegasus.conf b/workflow-generator/conf/tacc-stampede/pegasus.conf new file mode 100644 index 0000000..d9a1e96 --- /dev/null +++ b/workflow-generator/conf/tacc-stampede/pegasus.conf @@ -0,0 +1,21 @@ +pegasus.metrics.app = Soykb + +pegasus.catalog.site.file = sites.catalog + +pegasus.catalog.transformation.file = transformations.catalog + +pegasus.catalog.replica = File +pegasus.catalog.replica.file = replica.catalog + +pegasus.dir.useTimestamp = true +pegasus.dir.storage.mapper = Flat +pegasus.dir.storage.deep = true +pegasus.condor.logs.symlink = false + +pegasus.data.configuration = sharedfs + +pegasus.transfer.threads = 2 +pegasus.transfer.lite.threads = 8 +pegasus.stagein.clusters = 3 +pegasus.stageout.clusters = 4 + diff --git a/workflow-generator/conf/tacc-stampede/replica.catalog b/workflow-generator/conf/tacc-stampede/replica.catalog new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/conf/tacc-stampede/site.conf b/workflow-generator/conf/tacc-stampede/site.conf new file mode 100644 index 0000000..2fc1abc --- /dev/null +++ b/workflow-generator/conf/tacc-stampede/site.conf @@ -0,0 +1,11 @@ + + +[exec_environment] + +staging_site = + +output_site = irods_iplant + +job_clustering = label + + diff --git a/workflow-generator/conf/tacc-stampede/sites.catalog.template b/workflow-generator/conf/tacc-stampede/sites.catalog.template new file mode 100644 index 0000000..48fe4d2 --- /dev/null +++ b/workflow-generator/conf/tacc-stampede/sites.catalog.template @@ -0,0 +1,40 @@ + + + + + + + + + + $pegasus_bin:$irods_bin:/usr/bin:/bin + $home/.ssh/workflow + + + + + + + + mpiexec + /home1/00384/rynge/software/pegasus/4.4.0cvs + /home1/00384/rynge/software/pegasus/4.4.0cvs/bin:/home1/00384/rynge/software/irods/3.2/bin:/scratch/projects/xsede/globus-5.0.4-r1/bin:/usr/bin:/bin + /scratch/projects/xsede/globus-5.0.4-r1/lib + /tmp + $tacc_allocation + + + + + + $home/irods.iplant.env + + + + + + + $home/.ssh/workflow + + + diff --git a/workflow-generator/conf/tacc-stampede/transformations.catalog b/workflow-generator/conf/tacc-stampede/transformations.catalog new file mode 100644 index 0000000..0d3768a --- /dev/null +++ b/workflow-generator/conf/tacc-stampede/transformations.catalog @@ -0,0 +1,19 @@ +tr pegasus::transfer { + site execution { + 
pfn "/home1/00384/rynge/software/pegasus/4.4.0cvs/bin/pegasus-transfer" + arch "x86_64" + os "linux" + type "INSTALLED" + profile globus "maxwalltime" "1440" + } +} +tr pegasus::mpiexec { + site execution { + pfn "/home1/00384/rynge/software/pegasus-mpi-cluster/pegasus-mpi-cluster" + arch "x86_64" + os "linux" + type "INSTALLED" + profile globus "jobtype" "mpi" + profile globus "maxwalltime" "2880" + } +} diff --git a/workflow-generator/conf/tacc-wrangler/pegasus.conf b/workflow-generator/conf/tacc-wrangler/pegasus.conf new file mode 100644 index 0000000..6820faf --- /dev/null +++ b/workflow-generator/conf/tacc-wrangler/pegasus.conf @@ -0,0 +1,24 @@ +pegasus.metrics.app = Soykb + +pegasus.catalog.site.file = sites.catalog + +pegasus.catalog.transformation.file = transformations.catalog + +pegasus.catalog.replica = File +pegasus.catalog.replica.file = replica.catalog + +pegasus.dir.useTimestamp = true +pegasus.dir.storage.mapper = Flat +pegasus.dir.storage.deep = true +pegasus.condor.logs.symlink = false + +pegasus.data.configuration = sharedfs + +pegasus.transfer.*.remote.sites = execution + +pegasus.transfer.threads = 1 +pegasus.stagein.clusters = 20 +pegasus.stageout.clusters = 20 +pegasus.file.cleanup.clusters.size = 4 + + diff --git a/workflow-generator/conf/tacc-wrangler/replica.catalog b/workflow-generator/conf/tacc-wrangler/replica.catalog new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/conf/tacc-wrangler/site.conf b/workflow-generator/conf/tacc-wrangler/site.conf new file mode 100644 index 0000000..a54fd51 --- /dev/null +++ b/workflow-generator/conf/tacc-wrangler/site.conf @@ -0,0 +1,11 @@ + + +[exec_environment] + +staging_site = + +output_site = irods_iplant + +job_clustering = + + diff --git a/workflow-generator/conf/tacc-wrangler/sites.catalog.template b/workflow-generator/conf/tacc-wrangler/sites.catalog.template new file mode 100644 index 0000000..143e4f3 --- /dev/null +++ b/workflow-generator/conf/tacc-wrangler/sites.catalog.template @@ -0,0 +1,46 @@ + + + + + + + + + + $pegasus_bin:$irods_bin:/usr/bin:/bin + $home/.ssh/workflow + + + + + + + + + condor + vanilla + TARGET.FileSystemDomain == "wrangler.tacc.utexas.edu" + TimeToLive + True + /home/00384/rynge/software/pegasus/4.5.1 + /tmp + + + + + + $home/irods.iplant.json + + + + + + + + + + + $home/.ssh/workflow + + + diff --git a/workflow-generator/conf/tacc-wrangler/transformations.catalog b/workflow-generator/conf/tacc-wrangler/transformations.catalog new file mode 100644 index 0000000..e69de29 diff --git a/workflow-generator/editWorkflow.py b/workflow-generator/editWorkflow.py new file mode 100755 index 0000000..a4f77ab --- /dev/null +++ b/workflow-generator/editWorkflow.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python2 + +import json +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('-p', '--path', help='path to workflow.json', default="workflow.json") +parser.add_argument('-n', '--name', help='name of workflow', default="soykb") +parser.add_argument('-v', '--version', help='version of workflow', default="1.0.0") +args = parser.parse_args() + +with open(args.path, "r") as file: + contents = file.read() +wf = json.loads(contents) +wf["name"] = args.name +wf["version"] = args.version + +with open(args.path, "w") as file: + json.dump(wf, file, indent=4, sort_keys=True) \ No newline at end of file diff --git a/workflow-generator/fillFastqFile.py b/workflow-generator/fillFastqFile.py new file mode 100755 index 0000000..f208a2c --- /dev/null +++ 
b/workflow-generator/fillFastqFile.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python2
+import argparse
+prefix = "http://workflow.isi.edu/SoyKB/sample-inputs-3/"
+fastq_data = [
+    "USB-001_1.fastq", "USB-001_2.fastq", "USB-002_1.fastq", "USB-002_2.fastq", "USB-003_1.fastq", "USB-003_2.fastq", "USB-004_1.fastq", "USB-004_2.fastq", "USB-005_1.fastq", "USB-005_2.fastq", "USB-006_1.fastq", "USB-006_2.fastq", "USB-007_1.fastq", "USB-007_2.fastq", "USB-008_1.fastq", "USB-008_2.fastq", "USB-009_1.fastq", "USB-009_2.fastq", "USB-010_1.fastq", "USB-010_2.fastq", "USB-011_1.fastq", "USB-011_2.fastq", "USB-012_1.fastq", "USB-012_2.fastq", "USB-013_1.fastq", "USB-013_2.fastq", "USB-014_1.fastq", "USB-014_2.fastq", "USB-015_1.fastq", "USB-015_2.fastq", "USB-016_1.fastq", "USB-016_2.fastq", "USB-017_1.fastq", "USB-017_2.fastq", "USB-018_1.fastq", "USB-018_2.fastq", "USB-019_1.fastq", "USB-019_2.fastq", "USB-020_1.fastq", "USB-020_2.fastq", "USB-021_1.fastq", "USB-021_2.fastq", "USB-022_1.fastq", "USB-022_2.fastq", "USB-023_1.fastq", "USB-023_2.fastq", "USB-024_1.fastq", "USB-024_2.fastq", "USB-025_1.fastq", "USB-025_2.fastq"
+]
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-p', '--path', help='path to inputs-fastq.txt file', default="inputs-fastq.txt")
+parser.add_argument('-s', '--size', help='size to fill inputs-fastq.txt with. Should be divisible by 2 for the workflow to work. Max available value: {0}'.format(len(fastq_data)), default="2")
+args = parser.parse_args()
+
+
+lines = [prefix + fastq_data[i] for i in range(int(args.size))]
+
+with open(args.path, "w") as file:
+    file.writelines('\n'.join(lines))
+    file.close()
\ No newline at end of file
diff --git a/workflow-generator/generate-workflow b/workflow-generator/generate-workflow
new file mode 100755
index 0000000..d8df3dc
--- /dev/null
+++ b/workflow-generator/generate-workflow
@@ -0,0 +1,17 @@
+#!/bin/sh
+if [ $# -eq 0 ]; then
+    echo -e "Usage: generate-workflow <size>\nSize should be divisible by 2 for this configuration of soykb."
+    exit 1
+fi
+if [ -d "/workdir" ]; then
+    echo "Installing config files in /workdir..."
+else + exit 1 +fi + +python2 fillFastqFile.py -p inputs-fastq.txt -s $1 +python2 workflow-generator --exec-env tacc-stampede && hflow-convert-dax data/soykb.dax > data/workflow.json +python2 editWorkflow.py -p data/workflow.json -n soykb -v 1.0.0 +cp inputs-fastq.txt data/ + +cp -r data /workdir/ diff --git a/workflow-generator/inputs-fastq.txt b/workflow-generator/inputs-fastq.txt new file mode 100644 index 0000000..b6e7328 --- /dev/null +++ b/workflow-generator/inputs-fastq.txt @@ -0,0 +1,2 @@ +http://workflow.isi.edu/SoyKB/sample-inputs-3/USB-001_1.fastq +http://workflow.isi.edu/SoyKB/sample-inputs-3/USB-001_2.fastq \ No newline at end of file diff --git a/workflow-generator/inputs-ref.txt b/workflow-generator/inputs-ref.txt new file mode 100644 index 0000000..10c6e9f --- /dev/null +++ b/workflow-generator/inputs-ref.txt @@ -0,0 +1 @@ +http://workflow.isi.edu/SoyKB/ref/Gmax_275_v2.0.fa diff --git a/workflow-generator/workflow-generator b/workflow-generator/workflow-generator new file mode 100755 index 0000000..262344b --- /dev/null +++ b/workflow-generator/workflow-generator @@ -0,0 +1,1037 @@ +#!/usr/bin/env python + +from Pegasus.AutoADAG import * +import ConfigParser +from Pegasus.DAX3 import * +import getpass +import logging +import math +import optparse +import os +import re +import socket +import string +import subprocess +import sys +import time + + + +# to setup python lib dir for importing Pegasus PYTHON DAX API +# pegasus_config = os.path.join("pegasus-config") + " --noeoln --python" +# lib_dir = subprocess.Popen(pegasus_config, + # stdout=subprocess.PIPE, + # shell=True).communicate()[0] +#Insert this directory in our search path +# os.sys.path.insert(0, lib_dir) + + +# --- global variables ---------------------------------------------------------------- + +logger = logging.getLogger("my_logger") +conf = None +added_execs = [] + + +# --- classes ------------------------------------------------------------------------- + +class ComputeJob(Job): + """ A Pegasus DAX Job with extra information such as cpu and memory + requirements, for both single and peagaus-mpi-cluster execution + """ + + def __init__(self, name, cores=1, mem_gb=2, partition="part1"): + Job.__init__(self, name=name) + + # label based clustering + self.addProfile(Profile(Namespace.PEGASUS, + key="label", + value=partition)) + + # standard resource requirements for all jobs + mem_mb = mem_gb * 1000 + self.addProfile(Profile(Namespace.CONDOR, + key="request_cpus", + value=str(cores))) + self.addProfile(Profile(Namespace.PEGASUS, + key="pmc_request_cpus", + value=str(cores))) + self.addProfile(Profile(Namespace.CONDOR, + key="request_memory", + value=str(mem_mb))) + self.addProfile(Profile(Namespace.PEGASUS, + key="pmc_request_memory", + value=str(mem_mb))) + self.addProfile(Profile(Namespace.CONDOR, + key="request_disk", + value=str(20*1024*1024))) + self.addProfile(Profile(Namespace.GLOBUS, + key="totalmemory", + value=str(mem_mb))) + + # special sauce for TACC - we want smaller jobs to go to the normal + # compute nodes and the large memory ones to go to the large memory + # nodes + if re.search('stampede', conf.get("local", "exec_env")): + hosts = conf.get("exec_environment", "hosts_" + partition) + cores = str(16 * int(hosts)) + self.addProfile(Profile(Namespace.GLOBUS, + key="queue", + value="normal")) + self.addProfile(Profile(Namespace.GLOBUS, + key="hostcount", + value=hosts)) + self.addProfile(Profile(Namespace.GLOBUS, + key="count", + value=cores)) + self.addProfile(Profile(Namespace.ENV, + 
key="PMC_HOST_MEMORY", + value="29000")) + + # let the GATK jobs know how much memory to use (requested - 2GB for Java) + #gatk_memory = mem_gb - 2 + + # required for the Pegasus accounting + self.addProfile(Profile(Namespace.PEGASUS, + key="cores", + value=str(cores))) + + + +# --- functions ----------------------------------------------------------------------- + + +def setup_logger(verbose): + """ Use a console logger for all output to the user """ + + # log to the console + console = logging.StreamHandler() + + # default log level - make logger/console match + logger.setLevel(logging.INFO) + console.setLevel(logging.INFO) + + if verbose: + logger.setLevel(logging.DEBUG) + console.setLevel(logging.DEBUG) + + # formatter + formatter = logging.Formatter("%(asctime)s %(levelname)7s: %(message)s") + console.setFormatter(formatter) + logger.addHandler(console) + logger.debug("Logger has been configured") + + +def myexec(cmd_line): + """ Convenience function as we are shelling out a fair amount """ + + sys.stdout.flush() + p = subprocess.Popen(cmd_line + " 2>&1", shell=True) + stdoutdata, stderrdata = p.communicate() + r = p.returncode + if r != 0: + raise RuntimeError("Command '%s' failed with error code %s" \ + % (cmd_line, r)) + + +def proxy_check(): + """ Verify that the user has a proxy and it is valid for a long time """ + p = subprocess.Popen("grid-proxy-info -timeleft", shell=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdoutdata, stderrdata = p.communicate() + r = p.returncode + if r != 0: + logger.error(stderrdata) + raise RuntimeError("Unable to run the grid-proxy-info command." + \ + "Do you have a valid proxy?") + timeleft = int(stdoutdata) + + # two weeks minimum lifetime + if timeleft < (60*60*24*10): + raise RuntimeError("There is not enough time left on your grid" + + " proxy. Please renew, then run this command" + + " again") + + +def generate_site_catalog(): + """ Uses a templete file to produce the Pegasus site catalog """ + + logger.info("Generating sites.catalog") + inf = open("conf/" + conf.get("local", "exec_env") + + "/sites.catalog.template", 'r') + template = string.Template(inf.read()) + inf.close() + + outf = open(conf.get("local", "work_dir") + "/sites.catalog", "w") + outf.write(template.substitute( + submit_host = socket.gethostname(), + username = getpass.getuser(), + home = os.path.expanduser('~'), + top_dir = conf.get("local", "top_dir"), + work_dir = conf.get("local", "work_dir"), + pegasus_bin = conf.get("local", "pegasus_bin"), + irods_bin = conf.get("local", "irods_bin"), + tacc_allocation = conf.get("tacc", "allocation"), + tacc_username = conf.get("tacc", "username"), + tacc_storage_group = conf.get("tacc", "storage_group"), + )) + outf.close() + + +def read_input_lists(ref_urls, chromosomes, fastq_urls): + """ The user provides a list of reference file URLs and pairs of fastq + URLs to be processed. 
+    """
+
+    # first the reference
+    inf = open(conf.get("local", "top_dir") + "/inputs-ref.txt", "r")
+    for line in inf.readlines():
+        line = line.rstrip('\n')
+        if len(line) > 0:
+            ref_urls.append(line)
+    inf.close()
+
+    # chromosomes
+    inf = open(conf.get("local", "top_dir") + "/chromosomes.txt", "r")
+    for line in inf.readlines():
+        line = line.rstrip('\n')
+        line = re.sub("^> *", "", line)
+        line = re.sub(" .*", "", line)
+        if len(line) > 0:
+            logger.info(" Added chromosome: " + line)
+            chromosomes.append(line)
+    inf.close()
+
+    inf = open(conf.get("local", "top_dir") + "/inputs-fastq.txt", "r")
+    for line in inf.readlines():
+        line = line.rstrip('\n')
+        if len(line) > 0:
+            fastq_urls.append(line)
+    inf.close()
+
+    # sanity checks
+    if len(ref_urls) != 1:
+        logger.error("Only one reference genome can be listed in inputs-ref.txt")
+        sys.exit(1)
+    if len(fastq_urls) == 0:
+        logger.error("Did not find fastq files")
+        sys.exit(1)
+    if conf.get("main", "inputs-style") == "pair-end" and len(fastq_urls) % 2 != 0:
+        logger.error("Found an uneven number of fastq files in input list")
+        sys.exit(1)
+
+
+def extract_lfn(url):
+    """ determine a logical file name (basename) from a given URL """
+    return re.sub(".*/", "", url)
+
+
+def local_pfn(path):
+    """ generates a full pfn given a local path """
+    pfn = PFN("scp://" + getpass.getuser() + "@" + socket.gethostname() + "/" + path, "local")
+    if re.search('rc4', conf.get("local", "exec_env")):
+        pfn = PFN("file://" + path, "execution")
+    return pfn
+
+
+def extract_fasta_basefile(file_list):
+    """ find the base fasta file given a list of reference files """
+    for f in file_list:
+        if re.search("(.fa|.fasta)$", f.name):
+            return f
+
+
+def extract_sample_name(url):
+    """ sample name is the first part of the base name (for example:
+        HN001 from HN001_FCD1P1JACXX_L6_SZAIPI024836-36_1.fq)
+    """
+    name = re.sub(".*/", "", url)
+    name = re.sub("_.*", "", name)
+    name = re.sub("\..*", "", name)
+    return name
+
+
+def add_executable(dax, logical_name, wrapper_name):
+    """ adds executables to the DAX-level replica catalog """
+    global added_execs
+
+    if logical_name in added_execs:
+        return
+
+    wrapper = Executable(name=logical_name,
+                         arch="x86_64",
+                         installed=False)
+    wrapper.addPFN(local_pfn(conf.get("local", "top_dir") + "/wrappers/" + wrapper_name))
+    dax.addExecutable(wrapper)
+
+    added_execs.append(logical_name)
+
+
+def gunzip_job(dax, software_tar, f_in, f_out, parent_jobs=None):
+    """ adds a job to gunzip an input file
+    """
+    add_executable(dax, "gunzip", "gunzip-wrapper")
+    j = ComputeJob("gunzip", cores = 1, mem_gb = 2, partition = "part1")
+    j.uses(software_tar, link=Link.INPUT)
+    j.uses(f_in, link=Link.INPUT)
+    j.uses(f_out, link=Link.OUTPUT, transfer=False)
+    j.addArguments(f_in, f_out)
+    dax.addJob(j)
+    if parent_jobs is not None:
+        for parent in parent_jobs:
+            dax.depends(parent=parent, child=j)
+
+
+def prepare_ref_genome(dax, software_job, software_tar, ref_url, ref_files):
+
+    # temp list of files we need to keep track of
+    files = {}
+
+    lfn = extract_lfn(ref_url)
+    lfn_base = re.sub("\.[a-zA-Z0-9]+$", "", lfn)
+    lfn_ext = re.sub(".*\.", "", lfn)
+
+    j1 = None
+    if lfn_ext == "gz":
+        # add a job to gunzip the input
+        in_f = File(lfn)
+        in_f.addPFN(PFN(ref_url, "irods_iplant"))
+        dax.addFile(in_f)
+
+        fa_f = File(lfn_base + ".fa")
+        gunzip_job(dax, software_tar, in_f, fa_f, [software_job])
+
+        # update the lfn for subsequent jobs
+        lfn = lfn_base + ".fa"
+        lfn_ext = "fa"
+    elif lfn_ext == "fa":
+        # already a .fa file
+        fa_f = File(lfn)
+ fa_f.addPFN(PFN(ref_url, "irods_iplant")) + dax.addFile(fa_f) + else: + logger.error("Unable to handle reference genome with %s extension" %(lfn_ext)) + sys.exit(1) + files[fa_f.name] = fa_f + + # bwa index + add_executable(dax, "bwa-index", "bwa-wrapper") + j2 = ComputeJob("bwa-index", cores = 1, mem_gb = 4, partition = "part1") + j2.uses(software_tar, link=Link.INPUT) + j2.uses(fa_f, link=Link.INPUT) + for ext in [".amb", ".ann", ".bwt", ".pac", ".sa"]: + f = File(lfn + ext) + files[f.name] = f + j2.uses(f, link=Link.OUTPUT, transfer=False) + j2.addArguments("index", fa_f) + dax.addJob(j2) + dax.depends(parent=software_job, child=j2) + + # samtools faidx + add_executable(dax, "faidx", "samtools-wrapper") + j3 = ComputeJob("faidx", cores = 1, mem_gb = 4, partition = "part1") + j3.uses(software_tar, link=Link.INPUT) + j3.uses(fa_f, link=Link.INPUT) + f = File(lfn_base + ".fa.fai") + files[f.name] = f + j3.uses(f, link=Link.OUTPUT, transfer=False) + j3.addArguments("faidx", fa_f) + dax.addJob(j3) + dax.depends(parent=software_job, child=j3) + + # picard sequence dictionary + add_executable(dax, "seq_dict", "picard-wrapper") + j4 = ComputeJob("seq_dict", cores = 1, mem_gb = 4, partition = "part1") + j4.uses(software_tar, link=Link.INPUT) + j4.uses(fa_f, link=Link.INPUT) + f = File(lfn_base + ".dict") + files[f.name] = f + j4.uses(f, link=Link.OUTPUT, transfer=False) + j4.addArguments("CreateSequenceDictionary.jar", + "REFERENCE=" + fa_f.name, + "OUTPUT=" + f.name) + dax.addJob(j4) + dax.depends(parent=software_job, child=j4) + + # populate the reference file list which the rest of the workflow needs + for key, f in files.iteritems(): + ref_files.append(f) + + +def alignment_to_reference(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files): + + # Note that the cores we give Pegasus and the -t does not match. + # Oversubscriptions is ok, as bwa can not keep all the cores busy 100% + # of the time. 
+ add_executable(dax, "alignment_to_reference", "bwa-wrapper") + j = ComputeJob("alignment_to_reference", cores = 6, mem_gb = 8, + partition = "part1") + + # determine which is the fasta file + for f in ref_files: + j.uses(f, link=Link.INPUT) + j.uses(software_tar, link=Link.INPUT) + j.uses(tracked_files['sam'], link=Link.OUTPUT, transfer=False) + j.setStdout(tracked_files['sam']) + + if conf.get("main", "inputs-style") == "single-end": + # single-end inputs + j.uses(tracked_files['fastq_input'], link=Link.INPUT) + j.addArguments("mem", + "-t", "12", + "-M", extract_fasta_basefile(ref_files), + tracked_files['fastq_input']) + else: + # pair-end inputs + j.uses(tracked_files['paired_read1_fastq'], link=Link.INPUT) + j.uses(tracked_files['paired_read2_fastq'], link=Link.INPUT) + j.addArguments("mem", + "-t", "12", + "-M", extract_fasta_basefile(ref_files), + tracked_files['paired_read1_fastq'], + tracked_files['paired_read2_fastq']) + dax.addJob(j) + + # next step + sortsam_job = sort_sam(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files) + + return j + + +def sort_sam(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files): + + add_executable(dax, "sort_sam", "picard-wrapper") + j = ComputeJob("sort_sam", cores = 1, mem_gb = 16, partition = "part1") + j.uses(software_tar, link=Link.INPUT) + j.uses(tracked_files['sam'], link=Link.INPUT) + j.uses(tracked_files['sorted_reads'], link=Link.OUTPUT, transfer=False) + j.uses(tracked_files['sorted_index'], link=Link.OUTPUT, transfer=False) + + j.addArguments("SortSam.jar", + "CREATE_INDEX=TRUE", + "MAX_RECORDS_IN_RAM=5000000", + "I=" + tracked_files['sam'].name, + "O=" + tracked_files['sorted_reads'].name, + "SO=coordinate", + "VALIDATION_STRINGENCY=LENIENT") + dax.addJob(j) + + dedup_job = dedup(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files) + + + +def dedup(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files): + add_executable(dax, "dedup", "picard-wrapper") + j = ComputeJob("dedup", cores = 1, mem_gb = 16, + partition = "part1") + j.uses(software_tar, link=Link.INPUT) + j.uses(tracked_files['sorted_reads'], link=Link.INPUT) + j.uses(tracked_files['sorted_index'], link=Link.INPUT) + j.uses(tracked_files['deduped_reads'], link=Link.OUTPUT, transfer=False) + j.uses(tracked_files['deduped_index'], link=Link.OUTPUT, transfer=False) + #j.uses(tracked_files['deduped_metrics'], link=Link.OUTPUT, transfer=False) + + j.addArguments("MarkDuplicates.jar", + "CREATE_INDEX=TRUE", + "MAX_RECORDS_IN_RAM=5000000", + "I=" + tracked_files['sorted_reads'].name, + "O=" + tracked_files['deduped_reads'].name, + "METRICS_FILE=" + tracked_files['deduped_metrics'].name, + "VALIDATION_STRINGENCY=LENIENT") + + dax.addJob(j) + + add_replace(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files) + + +def add_replace(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files): + add_executable(dax, "add_replace", "picard-wrapper") + j = ComputeJob("add_replace", cores = 1, mem_gb = 16, + partition = "part1") + j.uses(software_tar, link=Link.INPUT) + j.uses(tracked_files['deduped_reads'], link=Link.INPUT) + j.uses(tracked_files['deduped_index'], link=Link.INPUT) + j.uses(tracked_files['addrepl_reads'], link=Link.OUTPUT, transfer=True) + j.uses(tracked_files['addrepl_index'], link=Link.OUTPUT, transfer=True) + + j.addArguments("AddOrReplaceReadGroups.jar", + "MAX_RECORDS_IN_RAM=5000000", + "I=" + tracked_files['deduped_reads'].name, + "O=" + 
tracked_files['addrepl_reads'].name, + "RGID=" + sample_name, + "LB=" + sample_name, + "PL=Illumina", + "SM=" + sample_name, + "CN=BGI", + "RGPU=" + sample_name, + "VALIDATION_STRINGENCY=LENIENT", + "SORT_ORDER=coordinate", + "CREATE_INDEX=TRUE") + dax.addJob(j) + + realign_target_creator(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files) + + +def realign_target_creator(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files): + add_executable(dax, "realign_target_creator", "gatk-wrapper") + j = ComputeJob("realign_target_creator", cores = 15, mem_gb = 10, + partition = "part1") + j.uses(software_tar, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + j.uses(tracked_files['addrepl_reads'], link=Link.INPUT) + j.uses(tracked_files['addrepl_index'], link=Link.INPUT) + j.uses(tracked_files['intervals'], link=Link.OUTPUT, transfer=False) + + j.addArguments("10", # memory + "-T", "RealignerTargetCreator", + "-nt", "15", + "-R", extract_fasta_basefile(ref_files), + "-I", tracked_files['addrepl_reads'], + "-o", tracked_files['intervals']) + dax.addJob(j) + + indel_realign(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files) + + +def indel_realign(sample_name, dax, software_tar, chromosomes, ref_files, tracked_files): + # IndelRealigner can only be run single threaded + add_executable(dax, "indel_realign", "gatk-wrapper") + j = ComputeJob("indel_realign", cores = 1, mem_gb = 10, + partition = "part1") + j.uses(software_tar, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + j.uses(tracked_files['addrepl_reads'], link=Link.INPUT) + j.uses(tracked_files['addrepl_index'], link=Link.INPUT) + j.uses(tracked_files['intervals'], link=Link.INPUT) + j.uses(tracked_files['indel_realigned_reads'], link=Link.OUTPUT, transfer=True) + j.uses(tracked_files['indel_realigned_index'], link=Link.OUTPUT, transfer=True) + + j.addArguments("10", # memory + "-T", "IndelRealigner", + "-R", extract_fasta_basefile(ref_files), + "-I", tracked_files['addrepl_reads'], + "-targetIntervals", tracked_files['intervals'], + "-o", tracked_files['indel_realigned_reads']) + dax.addJob(j) + + for chr in chromosomes: + haplotype_caller(sample_name, dax, software_tar, + ref_files, tracked_files, chr) + + +def select_and_filter_snp(dax, software_file, ref_files, tracked_files, + in_file, out_file, out_idx): + + # we need an intermediate file + intername = re.sub(".*/", "", in_file.name) + "_snp_only.vcf" + tracked_files[intername] = File(intername) + + add_executable(dax, "select_variants_snp", "gatk-wrapper") + j = ComputeJob("select_variants_snp", cores = 14, mem_gb = 10, + partition = "part3") + + # inputs + j.uses(software_file, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + j.uses(in_file, link=Link.INPUT) + + # outputs + j.uses(tracked_files[intername], link=Link.OUTPUT, transfer=False) + + j.addArguments("10", # memory + "-T", "SelectVariants", + "-nt", "15", + "-R", extract_fasta_basefile(ref_files), + "-selectType", "SNP", + "-V", in_file, + "-o", tracked_files[intername]) + + dax.addJob(j) + + add_executable(dax, "filtering_snp", "gatk-wrapper") + j = ComputeJob("filtering_snp", cores = 1, mem_gb = 10, + partition = "part3") + + # inputs + j.uses(software_file, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + j.uses(tracked_files[intername], link=Link.INPUT) + + # outputs + j.uses(out_file, link=Link.OUTPUT, transfer=True) + j.uses(out_idx, link=Link.OUTPUT, transfer=True) + + j.addArguments("10", 
# memory + "-T", "VariantFiltration", + "-R", extract_fasta_basefile(ref_files), + "-V", tracked_files[intername], + "--filterExpression", "'" + conf.get("main", "snp_filter") + "'", + "--filterName", "my_snp_filter", + "-o", out_file) + + dax.addJob(j) + + +def select_and_filter_indel(dax, software_file, ref_files, tracked_files, + in_file, out_file, out_idx): + + # we need an intermediate file + intername = re.sub(".*/", "", in_file.name) + "_indel_only.vcf" + tracked_files[intername] = File(intername) + + + add_executable(dax, "select_variants_indel", "gatk-wrapper") + j = ComputeJob("select_variants_indel", cores = 14, mem_gb = 10, + partition = "part3") + + # inputs + j.uses(software_file, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + j.uses(in_file, link=Link.INPUT) + + # outputs + j.uses(tracked_files[intername], link=Link.OUTPUT, transfer=False) + + j.addArguments("10", # memory + "-T", "SelectVariants", + "-nt", "15", + "-R", extract_fasta_basefile(ref_files), + "-selectType", "INDEL", + "-V", in_file, + "-o", tracked_files[intername]) + + dax.addJob(j) + + add_executable(dax, "filtering_indel", "gatk-wrapper") + j = ComputeJob("filtering_indel", cores = 1, mem_gb = 10, + partition = "part3") + + # inputs + j.uses(software_file, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + j.uses(tracked_files[intername], link=Link.INPUT) + + # outputs + j.uses(out_file, link=Link.OUTPUT, transfer=True) + j.uses(out_idx, link=Link.OUTPUT, transfer=True) + + j.addArguments("10", # memory + "-T", "VariantFiltration", + "-R", extract_fasta_basefile(ref_files), + "-V", tracked_files[intername], + "--filterExpression", "'" + conf.get("main", "indel_filter") + "'", + "--filterName", "my_indel_filter", + "-o", out_file) + + dax.addJob(j) + + +def haplotype_caller(sample_name, dax, software_file, ref_files, tracked_files, + chromosome): + + add_executable(dax, "haplotype_caller", "gatk-wrapper") + j = ComputeJob("haplotype_caller", cores = 1, mem_gb = 3, partition = "part2") + + # inputs + j.uses(software_file, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + j.uses(tracked_files['indel_realigned_reads'], link=Link.INPUT) + j.uses(tracked_files['indel_realigned_index'], link=Link.INPUT) + + # outputs + fname = conf.get("local", "run_id") + "-" + sample_name + "_" + chromosome + ".vcf" + tracked_files[fname] = File(fname) + j.uses(tracked_files[fname], link=Link.OUTPUT, transfer=False) + tracked_files[fname + ".idx"] = File(fname + ".idx") + j.uses(tracked_files[fname + ".idx"], link=Link.OUTPUT, transfer=False) + + j.addArguments("4", # memory + "-T", "HaplotypeCaller", + "--emitRefConfidence", "GVCF", + "--variant_index_type", "LINEAR", + "--variant_index_parameter", "128000", + "-L", chromosome, + "-R", extract_fasta_basefile(ref_files), + "-I", tracked_files['indel_realigned_reads'], + "-o", tracked_files[fname]) + dax.addJob(j) + + +def merge_gvcf(dax, software_file, chromosomes, ref_files, tracked_files, sample_names): + + # memory and cores based on what system we are targetting + cores = 1 + mem_gb = 20 + if re.search('wrangler', conf.get("local", "exec_env")): + mem_gb = 80 + + add_executable(dax, "merge_gcvf", "gatk-wrapper") + j = ComputeJob("merge_gcvf", cores = cores, mem_gb = mem_gb, partition = "part3") + + # inputs + files = [] + j.uses(software_file, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + for s in sample_names: + for chr in chromosomes: + fname = "%s-%s_%s.vcf" % (conf.get("local", 
"run_id"), s, chr) + j.uses(tracked_files[fname + ".idx"], link=Link.INPUT) + j.uses(tracked_files[fname], link=Link.INPUT) + files.append(fname) + + # create filelist to minimize the length of the command line + fd = open(conf.get("local", "work_dir") + "/haplotype-files.list", "w") + for f in files: + fd.write("%s\n" %(f)) + fd.close() + hf = File("haplotype-files.list") + hf.addPFN(local_pfn(conf.get("local", "work_dir") + "/haplotype-files.list")) + dax.addFile(hf) + j.uses("haplotype-files.list", link=Link.INPUT) + + # outputs + fname = conf.get("local", "run_id") + "-mergeGVCF.vcf" + tracked_files[fname] = File(fname) + j.uses(tracked_files[fname], link=Link.OUTPUT, transfer=True) + tracked_files[fname + ".idx"] = File(fname + ".idx") + j.uses(tracked_files[fname + ".idx"], link=Link.OUTPUT, transfer=True) + + j.addArguments(str(mem_gb), # first argument is memory + "-T", "CombineGVCFs", + "-R", extract_fasta_basefile(ref_files), + "-o", tracked_files[fname], + "--variant", "haplotype-files.list") + + dax.addJob(j) + + +def genotype_gvcfs(dax, software_file, ref_files, tracked_files, sample_names, + chromosome): + + add_executable(dax, "genotype_gvcfs", "gatk-wrapper") + j = ComputeJob("genotype_gvcfs", cores = 1, mem_gb = 10, + partition = "part2") + + # inputs + variant_files = [] + j.uses(software_file, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + for sname in sample_names: + fname = conf.get("local", "run_id") + "-" + sname + "_" + chromosome + ".vcf" + f = tracked_files[fname] + j.uses(f, link=Link.INPUT) + j.uses(tracked_files[fname + ".idx"], link=Link.INPUT) + variant_files.append(f) + + # outputs + fname = conf.get("local", "run_id") + "-" + "GVCF_" + chromosome + ".vcf" + tracked_files[fname] = File(fname) + j.uses(tracked_files[fname], link=Link.OUTPUT, transfer=False) + tracked_files[fname + ".idx"] = File(fname + ".idx") + j.uses(tracked_files[fname + ".idx"], link=Link.OUTPUT, transfer=False) + + j.addArguments("10", # memory + "-T", "GenotypeGVCFs", + "-R", extract_fasta_basefile(ref_files), + "-o", tracked_files[fname], + "-L", chromosome) + for f in variant_files: + j.addArguments("--variant", f) + + dax.addJob(j) + + +def combine_variants(dax, software_file, chromosomes, ref_files, tracked_files): + + add_executable(dax, "combine_variants", "gatk-wrapper") + j = ComputeJob("combine_variants", cores = 1, mem_gb = 10, + partition = "part3") + + # inputs + j.uses(software_file, link=Link.INPUT) + for f in ref_files: + j.uses(f, link=Link.INPUT) + for chr in chromosomes: + fname = conf.get("local", "run_id") + "-" + "GVCF_%s.vcf" % (chr) + j.uses(tracked_files[fname], link=Link.INPUT) + j.uses(tracked_files[fname + ".idx"], link=Link.INPUT) + + # outputs + fname = conf.get("local", "run_id") + "-" + "All.vcf" + tracked_files[fname] = File(fname) + j.uses(tracked_files[fname], link=Link.OUTPUT, transfer=True) + tracked_files[fname + ".idx"] = File(fname + ".idx") + j.uses(tracked_files[fname + ".idx"], link=Link.OUTPUT, transfer=True) + + j.addArguments("10", # memory + "-T", "CombineVariants", + "--genotypemergeoption", "UNIQUIFY", + "-R", extract_fasta_basefile(ref_files), + "-o", tracked_files[fname]) + for chr in chromosomes: + fname = conf.get("local", "run_id") + "-" + "GVCF_%s.vcf" % (chr) + j.addArguments("--variant", tracked_files[fname]) + + dax.addJob(j) + + # filter the results + tracked_files[conf.get("local", "run_id") + "-" + 'All_filtered_snp.vcf'] = \ + File(conf.get("local", "run_id") + "-" + "All_filtered_snp.vcf") + 
tracked_files[conf.get("local", "run_id") + "-" + 'All_filtered_snp.vcf.idx'] = \ + File(conf.get("local", "run_id") + "-" + "All_filtered_snp.vcf.idx") + select_and_filter_snp(dax, software_file, ref_files, tracked_files, + tracked_files[conf.get("local", "run_id") + "-" + 'All.vcf'], + tracked_files[conf.get("local", "run_id") + "-" + 'All_filtered_snp.vcf'], + tracked_files[conf.get("local", "run_id") + "-" + 'All_filtered_snp.vcf.idx']) + + tracked_files[conf.get("local", "run_id") + "-" + 'All_filtered_indel.vcf'] = \ + File(conf.get("local", "run_id") + "-" + "All_filtered_indel.vcf") + tracked_files[conf.get("local", "run_id") + "-" + 'All_filtered_indel.vcf.idx'] = \ + File(conf.get("local", "run_id") + "-" + "All_filtered_indel.vcf.idx") + select_and_filter_indel(dax, software_file, ref_files, tracked_files, + tracked_files[conf.get("local", "run_id") + "-" + 'All.vcf'], + tracked_files[conf.get("local", "run_id") + "-" + 'All_filtered_indel.vcf'], + tracked_files[conf.get("local", "run_id") + "-" + 'All_filtered_indel.vcf.idx']) + + +def generate_dax(): + """ generates the Pegasus DAX (directed acyclic graph - abstract XML) + which is a description of a workflow """ + + logger.info("Generating abstract workflow (DAX)") + + dax = AutoADAG("soykb") + + # The key to adding jobs to this workflow is the AutoADAG - it allows you + # to add jobs with listed input and output files, and then the AutoADAG + # will figure out the relationships between the jobs. There is no need + # to list parent/child relationships, but you can do that if you feel it + # makes the relationships more clear than just specifying the + # inputs/outputs. + + # email notificiations for when the state of the workflow changes + dax.invoke('all', conf.get("local", "top_dir") + "/email-notify") + + ref_urls = [] + chromosomes = [] + fastq_urls = [] + read_input_lists(ref_urls, chromosomes, fastq_urls) + + # determine how many TACC compute nodes we need + num_inputs_in_set = min(len(fastq_urls) / 2, 100) + conf.set("exec_environment", "hosts_part1", str( (num_inputs_in_set // 16 + 1) * 4 )) + conf.set("exec_environment", "hosts_part2", str( (num_inputs_in_set // 16 + 1) * 4 )) + conf.set("exec_environment", "hosts_part3", str( 1 )) + + # we need to bring a copy of the software with us + software_tar = File("software.tar.gz") + software_tar.addPFN(local_pfn(conf.get("local", "work_dir") + "/software.tar.gz")) + dax.addFile(software_tar) + add_executable(dax, "software-wrapper", "software-wrapper") + software_job = ComputeJob("software-wrapper", cores=1, mem_gb=1) + software_job.uses(software_tar, link=Link.INPUT) + dax.addJob(software_job) + + # we need to track files across jobs + tracked_files = {} + sample_names = [] + + # reference genome - add some jobs to prepare reference genome + ref_files = [] + prepare_ref_genome(dax, software_job, software_tar, ref_urls[0], ref_files) + + lane_count = len(fastq_urls) + if conf.get("main", "inputs-style") == "pair-end": + lane_count = len(fastq_urls) / 2 + + for lane in range(lane_count): + + # input files for this lane + if conf.get("main", "inputs-style") == "single-end": + lfn = extract_lfn(fastq_urls[lane]) + if re.search("\.gz$", lfn): + f_gz = File(extract_lfn(fastq_urls[lane])) + f_gz.addPFN(PFN(fastq_urls[lane], "irods_iplant")) + dax.addFile(f_gz) + fa_name = extract_lfn(fastq_urls[lane]) + fa_name = re.sub("\..*", ".fa", fa_name) + f_fa = File(fa_name) + gunzip_job(dax, software_tar, f_gz, f_fa, [software_job]) + else: + f_fa = File(extract_lfn(fastq_urls[lane])) + 
f_fa.addPFN(PFN(fastq_urls[lane], "irods_iplant")) + dax.addFile(f_fa) + tracked_files['fastq_input'] = f_fa + sample_name = extract_sample_name(tracked_files['fastq_input'].name) + else: + tracked_files['paired_read1_fastq'] = File(extract_lfn(fastq_urls[lane * 2])) + tracked_files['paired_read1_fastq'].addPFN(PFN(fastq_urls[lane * 2], "irods_iplant")) + dax.addFile(tracked_files['paired_read1_fastq']) + + tracked_files['paired_read2_fastq'] = File(extract_lfn(fastq_urls[lane * 2 + 1])) + tracked_files['paired_read2_fastq'].addPFN(PFN(fastq_urls[lane * 2 + 1], "irods_iplant")) + dax.addFile(tracked_files['paired_read2_fastq']) + + sample_name = extract_sample_name(tracked_files['paired_read1_fastq'].name) + + # files we need to track + tracked_files['sam'] = File(conf.get("local", "run_id") + "-" + sample_name + "_aligned_reads.sam") + tracked_files['sorted_reads'] = File(conf.get("local", "run_id") + "-" + sample_name + "_sorted_reads.bam") + tracked_files['sorted_index'] = File(conf.get("local", "run_id") + "-" + sample_name + "_sorted_reads.bai") + tracked_files['deduped_reads'] = File(conf.get("local", "run_id") + "-" + sample_name + "_deduped_reads.bam") + tracked_files['deduped_index'] = File(conf.get("local", "run_id") + "-" + sample_name + "_deduped_reads.bai") + tracked_files['deduped_metrics'] = File(conf.get("local", "run_id") + "-" + sample_name + "_deduped.metrics") + tracked_files['addrepl_reads'] = File(conf.get("local", "run_id") + "-" + sample_name + "_addrepl.bam") + tracked_files['addrepl_index'] = File(conf.get("local", "run_id") + "-" + sample_name + "_addrepl.bai") + tracked_files['intervals'] = File(conf.get("local", "run_id") + "-" + sample_name + "_intervals.list") + tracked_files['indel_realigned_reads'] = File(conf.get("local", "run_id") + "-" + sample_name + "_indel_realigned.bam") + tracked_files['indel_realigned_index'] = File(conf.get("local", "run_id") + "-" + sample_name + "_indel_realigned.bai") + + # Step 1 - dependent jobs are now added in the parent jobs + align_job = alignment_to_reference(sample_name, + dax, + software_tar, + chromosomes, + ref_files, + tracked_files) + dax.depends(parent=software_job, child=align_job) + + # keep a list of samples for the GenotypeGVCFs call + sample_names.append(sample_name) + + # combine all haplotype_caller outputs into one merged file for output + merge_gvcf(dax, software_tar, chromosomes, ref_files, tracked_files, sample_names) + + # run genotype_gvcfs per chromosome + for chr in chromosomes: + genotype_gvcfs(dax, software_tar, ref_files, tracked_files, + sample_names, chr) + + combine_variants(dax, software_tar, chromosomes, ref_files, tracked_files) + + # write out the dax + dax_file = open(conf.get("local", "work_dir") + "/soykb.dax", "w") + dax.writeXML(dax_file) + dax_file.close() + + +def main(): + global conf + + setup_logger(True) + + # Configure command line option parser + prog_usage = "usage: workflow-generator [options]" + parser = optparse.OptionParser(usage=prog_usage) + + parser.add_option("-e", "--exec-env", action = "store", dest = "exec_env", + help = "Handle for the target execution environment.") + + # Parse command line options + (options, args) = parser.parse_args() + if options.exec_env == None: + logger.fatal("Please specify an execution environment with --exec-env") + sys.exit(1) + + # read the config file and add those settings to the option object + conf = ConfigParser.SafeConfigParser({'username': getpass.getuser()}) + r = conf.read(["conf/.soybean-workflow.conf", \ + "conf/main.conf", 
\
+                   "conf/%s/site.conf" % options.exec_env])
+    logger.debug(["conf/.soybean-workflow.conf", \
+                   "conf/main.conf", \
+                   "conf/%s/site.conf" % options.exec_env])
+    if len(r) != 3:
+        logger.fatal("Unable to read configuration files for that environment")
+        sys.exit(1)
+
+    if conf.get("main", "inputs-style") != "single-end" and \
+       conf.get("main", "inputs-style") != "pair-end":
+        logger.fatal("Valid choices for the main/inputs-style configuration are" + \
+                     " single-end or pair-end")
+        sys.exit(1)
+
+    conf.set("local", "username", getpass.getuser())
+    conf.set("local", "exec_env", options.exec_env)
+    conf.set("local", "top_dir", os.path.dirname(os.path.realpath( __file__ )))
+
+    # run id
+    conf.set("local", "run_id", time.strftime("%Y%m%d-%H%M%S", time.gmtime()))
+
+    # add the run id to the work dir
+    # conf.set("local", "work_dir", conf.get("local", "work_dir") + "/" +
+    #          conf.get("local", "run_id"))
+
+    # local Pegasus environment
+    # pegasus_config = os.path.join("pegasus-config") + " --noeoln --bin"
+    # pegasus_bin_dir = subprocess.Popen(pegasus_config,
+    #                                    stdout=subprocess.PIPE,
+    #                                    shell=True).communicate()[0]
+    # conf.set("local", "pegasus_bin", pegasus_bin_dir)
+    conf.set("local", "pegasus_bin", "pegasus_bin_dir")
+
+    # check proxy before doing anything else
+    #proxy_check()
+
+    # create a local work directory for the workflow
+    logger.info("Setting up work directory at %s" \
+                %(conf.get("local", "work_dir")))
+    if os.path.exists(conf.get("local", "work_dir")):
+        logger.fatal("Work directory already exists")
+        sys.exit(1)
+    os.makedirs(conf.get("local", "work_dir"))
+
+    # tar up the software
+    # logger.info("Tarring up software directory to send with jobs")
+    # myexec("tar czf " + conf.get("local", "work_dir") + \
+    #        "/software.tar.gz software")
+
+    generate_site_catalog()
+
+    # FIXME: what should we copy / keep in the top dir?
+    myexec("cp conf/" + conf.get("local", "exec_env") +
+           "/transformations.catalog " +
+           conf.get("local", "work_dir") + "/transformations.catalog")
+    myexec("cp conf/" + conf.get("local", "exec_env") +
+           "/replica.catalog " +
+           conf.get("local", "work_dir") + "/replica.catalog")
+
+    generate_dax()
+
+    # submit
+    logger.info("Planning workflow...")
+    os.chdir(conf.get("local", "work_dir"))
+    # cmd = "pegasus-plan " + \
+    #       " --conf " + conf.get("local", "top_dir") + \
+    #       "/conf/" + conf.get("local", "exec_env") + "/pegasus.conf" + \
+    #       " --dir ." + \
+    #       " --relative-dir wf-" + conf.get("local", "run_id") + \
+    #       " --sites execution"
+
+    # if conf.get("exec_environment", "output_site") != "":
+    #     cmd += " --output-site " + conf.get("exec_environment", "output_site")
+
+    # if conf.get("exec_environment", "staging_site") != "":
+    #     cmd += " --staging " + conf.get("exec_environment", "staging_site")
+
+    # if conf.get("exec_environment", "job_clustering") != "":
+    #     cmd += " --cluster " + conf.get("exec_environment", "job_clustering")
+
+    # cmd += " --dax soykb.dax" + \
+    #       " --submit"
+    # logger.info(cmd)
+    # myexec(cmd + " 2>&1 | tee pegasus-plan.out")
+
+
+if __name__ == "__main__":
+    main()
+
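For reference, the GATK steps in workflow-generator all pass the requested memory (in GB) as the first argument to the gatk-wrapper (see the "# memory" comments and the str(mem_gb) argument in merge_gvcf), alongside the request_memory / pmc_request_memory profiles that ComputeJob derives from the same figure. The following is a minimal sketch of that pattern using the Pegasus DAX3 API imported by the generator; the job name, file names, and the 10 GB value are illustrative assumptions, not taken from the repository (the real generator also uses AutoADAG("soykb") rather than a plain ADAG).

# Illustrative sketch only: one GATK-style job whose memory figure feeds both
# the Condor resource profile and the wrapper's first argument.
from Pegasus.DAX3 import ADAG, File, Job, Link, Namespace, Profile

dax = ADAG("soykb-sketch")   # plain ADAG used here for a self-contained example
mem_gb = 10                  # example figure, like the "10" literals in the jobs above

j = Job(name="realign_target_creator")
j.addProfile(Profile(Namespace.CONDOR, key="request_memory", value=str(mem_gb * 1000)))

reads = File("sample_addrepl.bam")        # hypothetical logical file names
intervals = File("sample_intervals.list")
j.uses(reads, link=Link.INPUT)
j.uses(intervals, link=Link.OUTPUT, transfer=False)

# first argument is the memory in GB, consumed by the gatk-wrapper
j.addArguments(str(mem_gb), "-T", "RealignerTargetCreator",
               "-I", reads, "-o", intervals)
dax.addJob(j)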