"""
This file is part of Image Harvest.

Image Harvest is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Image Harvest is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Image Harvest.  If not, see <http://www.gnu.org/licenses/>.
"""
import os
import conf
import sys
import json
import re
import csv
import datetime
import sqlite3
import textwrap
import shutil
import errno
import copy
import traceback
from Pegasus.DAX3 import *

class Validator(object):
    """
    A very, very generic validator.

    Loads one input, either a JSON template (type "file") or a sqlite
    database (type "db"), and collects human readable problems in
    self.err.  Subclasses overload validate() to add their own checks.

    Attributes set by __init__:
        err       accumulated error text, empty when nothing failed
        type      the type string passed in
        rawFiles  empty dict, filled in by users of this class
        data      parsed JSON for type "file", otherwise None
        conn      sqlite3 connection for type "db", otherwise None
    """
    def __init__(self, f, type):
        """
            :param f: Path to the input file.
            :param type: Either "file" (JSON) or "db" (sqlite database).
        """
        self.err = ""
        # Define every attribute up front so that a failed load leaves the
        # object in a consistent state instead of raising AttributeError
        # the first time a caller looks at data / conn / rawFiles.
        self.type = type
        self.rawFiles = {}
        self.data = None
        self.conn = None
        if not os.path.isfile(f):
            self.err += "Path Error: Input '%s' doesn't exist.\n" % (f,)
            return
        if type == "file":
            with open(f, "r") as rh:
                try:
                    self.data = json.load(rh)
                except Exception as e:
                    self.err += "Json Error: File '%s', %s\n" % (f, str(e))
        elif type == "db":
            self.conn = sqlite3.connect(f)
            self.conn.row_factory = sqlite3.Row
            try:
                result = self.conn.execute("select * from images")
                if not result.fetchone():
                    self.err += "Database Error: No information in images table\n"
            except Exception as e:
                self.err += "Database Error: %s\n" % (str(e), )
        else:
            self.err += "Validator Error: Invalid type, must be either 'file' or 'db'\n"
        # validate() runs even when loading reported an error; overloads
        # are expected to check isValid() before touching data / conn.
        self.validate()
        return

    def printErrors(self):
        """
            Prints out errors from validation, or a short success
            message when there are none.  Does not exit.
        """
        if self.err:
            print(self.err.strip())
        else:
            print("No Validation Errors.")
        return

    def isValid(self):
        """
            A convenience function.  If validation ran successfully,
            return True, else return False.
        """
        return not self.err

    def validate(self):
        """
            This function should be overloaded
        """
        return
class Workflow(object):
    """
    Shared validation logic for workflow templates.

    Wraps three Validator objects (workflow template, configuration file
    and image database) and accumulates every problem found in self.err.
    Subclasses overload validate() and _validateDB(); the argument,
    dependency and config checks here are shared.

    The checks are driven by the project 'conf' module: conf.valid is
    indexed by executable name and conf.templateKeys by template section.
    """

    def __init__(self, template, config, database):
        """
            :param template: Path to the workflow template (JSON).
            :param config: Path to the configuration file (JSON).
            :param database: Path to the sqlite image database.
        """
        self.workflow = Validator(template, "file")
        self.config = Validator(config, "file")
        self.db = Validator(database, "db")
        self.err = ""
        if self.workflow.isValid() and self.db.isValid() and self.config.isValid():
            self.validate()
        else:
            self.err += "Workflow components did not validate individually. \n"
            self.err += self.workflow.err
            self.err += self.config.err
            self.err += self.db.err
        return

    def isValid(self):
        """
        A convenience function.  True when no error has been recorded.
        """
        return not self.err

    def printErrors(self):
        """
        Prints the accumulated errors, or a short success message.
        """
        if self.err:
            print(self.err.strip())
        else:
            print("No Validation Errors")
        return

    def _loadOverwriteArgument(self, job, arg):
        """
        Replaces the job's value for 'arg' with the fixed value from conf.
        """
        job["arguments"][arg] = conf.valid[job["executable"]]["arguments"][arg]["value"]
        return

    def _loadDerivedArgument(self, job, arg, jtype):
        """
        Fills in argument 'arg' from another entry of the job definition.

        conf names a job key and an index; the job entry under that key
        must be a list long enough to hold that index.  The argument
        becomes the conf "value" when one is given, otherwise the list
        element.  Problems are only reported for required arguments.
        """
        spec = conf.valid[job["executable"]]["arguments"][arg]
        key = spec["key"]
        index = spec["index"]
        required = "required" in spec
        if key not in job:
            if required:
                self.err += "Workflow, Argument Error: Type '%s' job '%s', derived argument '%s' requires job definition for '%s', no such definition. \n" % (jtype, job["name"], arg, key)
            return
        if not isinstance(job[key], list):
            if required:
                self.err += "Workflow, Argument Error: Type '%s' job '%s', derived argument '%s' requires definition for '%s' to be of type list, definition is of type '%s'. \n" % (jtype, job["name"], arg, key, type(job[key]).__name__)
            return
        if index > len(job[key]) - 1:
            if required:
                self.err += "Workflow, Argument Error: Type '%s' job '%s', derived argument '%s' requires index '%s' for definition '%s', no such index. \n" % (jtype, job["name"], arg, index, key)
            return
        if "value" in spec:
            job["arguments"][arg] = spec["value"]
        else:
            job["arguments"][arg] = job[key][index]
        return

    def _validateArgumentType(self, job, arg, type):
        """
        Checks the argument against the type named in conf and, when it
        matches, converts it to its string form in place.

        Lists are joined with the conf "join" string when one is given;
        other lists and dicts are passed through str(); strings flagged
        "complex" in conf are wrapped in double quotes.
        """
        if arg not in job["arguments"]:
            return
        spec = conf.valid[job["executable"]]["arguments"][arg]
        value = job["arguments"][arg]
        expected = {
            "list": list,
            "string": (str, unicode),
            "dict": dict,
            "numeric": (int, long, float),
            "exist": object,
            "derived": object
        }[spec["type"]]
        if not isinstance(value, expected):
            self.err += "Workflow, Argument Error: Type '%s' job '%s', argument '%s' given value '%s', should be of type '%s'. \n" % (type, job["name"], arg, value, spec["type"])
        elif isinstance(value, list) and "join" in spec:
            job["arguments"][arg] = spec["join"].join(value)
        elif isinstance(value, (dict, list)):
            job["arguments"][arg] = str(value)
        # NOTE(review): the spec["type"] term is always truthy here, so
        # this only tests for "complex"; possibly a comparison was meant.
        elif isinstance(value, (str, unicode)) and spec["type"] and "complex" in spec:
            job["arguments"][arg] = '"' + value + '"'
        return

    def _validateArgumentRequired(self, job, arg, type):
        """
        Records an error when a required argument is absent or empty.
        """
        if "required" in conf.valid[job["executable"]]["arguments"][arg]:
            if arg in job["arguments"]:
                if job["arguments"][arg] == "":
                    self.err += "Workflow, Argument Error: Type '%s' job '%s', has empty required argument '%s' \n" % (type, job["name"], arg)
            else:
                self.err += "Workflow, Argument Error: Type '%s' job '%s', requires argument '%s', no such argument found. \n" % (type, job["name"], arg)
        return

    def _validateArgumentDictionary(self, job, arg, type):
        """
        Checks each value of the argument against the allowed set in conf.

        When the conf entry has a "key", the allowed set is looked up by
        the value of that other argument.  A missing key argument or an
        unknown lookup yields an empty allowed set, so every value is
        reported as invalid instead of raising.
        """
        spec = conf.valid[job["executable"]]["arguments"][arg]
        value = job["arguments"][arg]
        arglist = value if isinstance(value, list) else [value]
        if spec["validation"] == "dictionary":
            if "key" not in spec:
                allowed = spec["value"]
            else:
                allowed = spec["value"].get(job["arguments"].get(spec["key"]))
            if allowed is None:
                allowed = ()
            for val in arglist:
                if val not in allowed:
                    self.err += "Workflow, Argument Error: Type '%s' job '%s', has invalid value '%s' for argument '%s'. \n" % (type, job["name"], val, arg)
        return

    def _validateArgumentList(self, job, arg, type):
        """
        Checks each value of the argument against the conf "value" list.
        """
        value = job["arguments"][arg]
        arglist = value if isinstance(value, list) else [value]
        allowed = conf.valid[job["executable"]]["arguments"][arg]["value"]
        for val in arglist:
            if val not in allowed:
                self.err += "Workflow, Argument Error: Type '%s' job '%s', has invalid value '%s' for argument '%s'. \n" % (type, job["name"], val, arg)
        return

    def _validateArguments(self, job, type):
        """
        Validates (and normalizes in place) every argument of a job.

        Walks the arguments conf defines for the job's executable, loads
        derived / overwrite values, runs the required, type and value
        checks, then reports any argument conf does not know about.
        """
        if not job["arguments"]:
            job["arguments"] = {}
        validators = {
            "dictionary": self._validateArgumentDictionary,
            "list": self._validateArgumentList
        }
        for arg in conf.valid[job["executable"]]["arguments"]:
            spec = conf.valid[job["executable"]]["arguments"][arg]
            if spec["type"] == "derived":
                self._loadDerivedArgument(job, arg, type)
            if "required" in spec or arg in job["arguments"]:
                if spec["type"] == "overwrite":
                    self._loadOverwriteArgument(job, arg)
                else:
                    self._validateArgumentRequired(job, arg, type)
                    self._validateArgumentType(job, arg, type)
                # NOTE(review): the type check above may already have
                # turned a list / dict into a string, so value validation
                # sees the converted form.  Kept as is; confirm intent.
                if "validation" in spec and arg in job["arguments"]:
                    validators[spec["validation"]](job, arg, type)

        for arg in job["arguments"]:
            if arg not in conf.valid[job["executable"]]["arguments"]:
                self.err += "Workflow, Argument Error: Type '%s' job '%s', has invalid argument '%s'. \n" % (type, job["name"], arg)
        return

    def _validateDependencies(self, job, type):
        """
        Checks that every input of a job can be resolved.

        An input resolves when it is an existing file (recorded in the
        workflow's rawFiles for this type), when it is one of the inputs
        of the first job of the type, or when it is an output of a job
        named in "depends".  Inputs that come from a dependency must also
        have the same conf type as the output that provides them.

        Any unexpected exception prints the traceback and the errors
        collected so far, then terminates the process with exit status 1.
        """
        try:
            jobs = self.workflow.data["workflows"][type]
            names = [x["name"] for x in jobs]
            depends = job.get("depends", [])
            unres = copy.deepcopy(job["inputs"])
            for inp in job["inputs"]:
                if os.path.isfile(inp):
                    unres.remove(inp)
                    self.workflow.rawFiles[type].append(inp)
                elif inp in jobs[0]["inputs"]:
                    unres.remove(inp)
            for dependency in depends:
                if dependency in names:
                    provider = jobs[names.index(dependency)]
                    for output in provider["outputs"]:
                        unres = [x for x in unres if x != output]
                else:
                    self.err += "Workflow, Dependency Error: Type '%s' job '%s' depends on job '%s', no such job exists. \n" % (type, job["name"], dependency)
            for val in set(unres):
                self.err += "Workflow, Input Error: Type '%s' job '%s' depends on input '%s'.  Cannot find matching output, or raw file. \n" % (type, job["name"], val)
            for x, inp in enumerate(job["inputs"]):
                for dependency in depends:
                    if dependency not in names:
                        continue
                    provider = jobs[names.index(dependency)]
                    if inp in provider["outputs"]:
                        j = provider["outputs"].index(inp)
                        expected = conf.valid[job["executable"]]["inputs"][x]
                        actual = conf.valid[provider["executable"]]["outputs"][j]
                        if expected != actual:
                            self.err += "Workflow, Dependency Error: Type '%s' job '%s' input '%s' in position '%s' should be of type '%s', however, output '%s' in position '%s' of job '%s' is of type '%s'. \n" %  (type, job["name"], inp, x, expected, provider["outputs"][j], j, provider["name"], actual)
        except Exception:
            # Show what actually went wrong; the template is malformed in
            # a way the checks above do not anticipate.
            print(traceback.format_exc())
            self.printErrors()
            name = job.get("name", job) if isinstance(job, dict) else job
            print("Validation halted at type '%s' job '%s'.  Fix errors before re-running. \n" % (type, name))
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)
        return

    def _validateDB(self):
        """
        Validates the input database file.
        DIFFERENT FOR WORKFLOWS
        OVERLOAD THIS
        """
        return

    def _validateConfig(self):
        """
        Validates the input configuration template.
        SAME FOR BOTH TYPES OF WORKFLOWS

        Checks required / allowed keys, that 'installdir' is a directory,
        the 'maxwalltime', 'notify' and 'osg' sections, and that every
        profile entry whose key contains "path" points at existing
        directories.  Scalar path entries are wrapped in a list in place.
        """
        data = self.config.data
        if not data:
            self.err += "Config, Load Error: Could not load configuration info. \n"
            return
        keys = conf.templateKeys["config"]
        # '<=' rather than '<': a config holding exactly the required keys
        # and no optional ones is complete.
        if not set(keys["required"]) <= set(data.keys()):
            self.err += "Config, Template Error: Config file does not have all the required keys:" + str(keys["required"]) + " \n"
            return
        if not os.path.isdir(data["installdir"]):
            self.err += "Config, Path Error: Path '%s' specified for 'installdir' does not exist. \n" % (data["installdir"],)
        for key in data:
            if key not in keys["required"] and key not in keys["optional"]:
                self.err += "Config, Key Error: Invalid key '%s' specified.  Allowed keys are '%s'. \n" % (key, keys["required"] + keys["optional"])
        if "maxwalltime" in data:
            for key in data["maxwalltime"]:
                if key not in keys["maxwalltime"]["optional"]:
                    self.err += "Config, Key Error: Invalid key '%s' specified for 'maxwalltime'.  Allowed keys are '%s'. \n" % (key, keys["maxwalltime"]["optional"])
        if "notify" in data:
            for key in keys["notify"]["required"]:
                if key not in data["notify"]:
                    self.err += "Config, Key Error: Required key '%s' not specified for 'notify'.  \n" % (key,)
                elif key == "pegasus_home":
                    email = data["notify"]["pegasus_home"] + "/notification/email"
                    if not os.path.isfile(email):
                        self.err += "Config, Path Error: Required key '%s' has invalid value.  Path '%s' does not exist. \n" % (key, email)
        # 'profile' may be absent (the osg section below creates it on
        # demand), so do not index it directly.
        profile = data.get("profile", {})
        for namespace in profile:
            for key in profile[namespace]:
                if "path" not in key.lower():
                    continue
                if not isinstance(profile[namespace][key], list):
                    profile[namespace][key] = [profile[namespace][key]]
                if "osg" in data and namespace == "env":
                    # We ignore osg environment variables
                    continue
                for path in profile[namespace][key]:
                    if not os.path.isdir(path):
                        self.err += "Config, Path Error: Path '%s' specified for namespace '%s', key '%s' does not exist. \n" % (path, namespace, key)
        if "osg" in data:
            if "profile" not in data:
                data["profile"] = {}
            if "env" not in data["profile"]:
                data["profile"]["env"] = {}
            for key in keys["osg"]["required"]:
                if key not in data["osg"]:
                    self.err += "Config, OSG Key Error: Specifying 'osg' requires the key '%s' to be defined. \n" % (key,)
                elif not os.path.isfile(data["osg"][key]):
                    self.err += "Config, OSG Path Error: Path '%s' specified for key '%s' does not exist. \n" % (data["osg"][key], key)
        return

    def _getImageTypes(self):
        """
        Returns the distinct 'imtype' values found in the images table.
        """
        return [x["imtype"] for x in self.db.conn.execute("select distinct imtype from images")]

    def validate(self):
        """
        This should be overloaded again.
        """
        return


class ImageProcessor(Workflow):
    """
    Validator for image processing workflow templates.

    Adds the job, option and extraction checks of an image processing
    template, plus a database check that the required columns exist and
    that the first stored image path points at a real file.
    """

    def __init__(self, template, config, database):
        """
            :param template: Path to the workflow template (JSON).
            :param config: Path to the configuration file (JSON).
            :param database: Path to the sqlite image database.
        """
        super(ImageProcessor, self).__init__(template, config, database)
        return

    def validate(self):
        """
            Validates all inputted files.  Because the workflow is
            dependent on the config and metadata, workflow validation
            is only done if config and metadata validate successfully.
        """
        if not self.err:
            self._validateConfig()
            self._validateDB()
            if not self.err:
                self._validateWorkflow()
        return

    def _validateWorkflowJobs(self):
        """
            Validates every job of every image type in the template:
            unique job names, required / allowed keys, a known image
            processing executable that exists under 'installdir', and
            finally the job's arguments and dependencies.
        """
        types = self._getImageTypes()
        jobKeys = conf.templateKeys["imgproc"]["job"]
        for imtype in self.workflow.data["workflows"]:
            if imtype not in types:
                self.err += "Imgproc, Type Error: Workflow definition exists for type '%s', no images have been loaded for that type. \n" % (imtype,)
            self.workflow.rawFiles[imtype] = []
            jobs = self.workflow.data["workflows"][imtype]
            names = [x["name"] for x in jobs]
            if len(names) != len(set(names)):
                self.err += "Imgproc, Name Error: Cannot parse ambiguous workflow.  Workflow defined for type '%s' contains multiple jobs with the same name. \n" % (imtype,)
                continue
            for job in jobs:
                if not set(jobKeys["required"]) <= set(job.keys()):
                    self.err += "Imgproc, Key Error: Job '%s' doesn't have all required keys: '%s'.  \n" % (job["name"], jobKeys["required"])
                    continue
                for key in job.keys():
                    if key not in jobKeys["required"] and key not in jobKeys["optional"]:
                        self.err += "Imgproc, Key Error: Job '%s' has invalid key '%s' specified.  Allowed keys are '%s'. \n" % (job["name"], key, jobKeys["required"] + jobKeys["optional"])
                if job["executable"] not in conf.valid:
                    self.err += "Imgproc, Executable Error: Job '%s' has non-existant executable '%s' specified. \n" % (job["name"], job["executable"])
                    continue
                exepath = self.config.data["installdir"] + "/" + job["executable"]
                if not os.path.isfile(exepath):
                    self.err += "Imgproc, Path Error: Job '%s' executable '%s' does not exist. \n" % (job["name"], exepath)
                if conf.valid[job["executable"]]["type"] == "imgproc":
                    self._validateArguments(job, imtype)
                    self._validateDependencies(job, imtype)
                else:
                    self.err += "Imgproc, Executable Error: Job '%s' has invalid executable '%s' specified.  Only image processing scripts are allowed. \n" % (job["name"], job["executable"])
        return

    def _validateWorkflowOptions(self):
        """
            Checks the 'options' section for required and unknown keys.
        """
        optionKeys = conf.templateKeys["imgproc"]["options"]
        options = self.workflow.data["options"]
        if set(optionKeys["required"]) <= set(options.keys()):
            for key in options:
                if key not in optionKeys["required"] and key not in optionKeys["optional"]:
                    self.err += "Imgproc, Option Error: Invalid option '%s' specified. \n " % (key,)
        else:
            self.err += "Imgproc, Option Error: Option specification doesn't have all required keys: '%s'. \n " % (optionKeys["required"],)
        return

    def _validateWorkflowExtract(self):
        """
            Validates the 'extract' section.

            Checks required / unknown keys, validates the optional
            'histogram-bin' entry as an 'ih-stats-histogram-bin' job, and
            then validates one 'ih-extract' job per image type.  The
            per-type entries are completed in place with 'executable',
            'outputs' and 'name'.
        """
        extractKeys = conf.templateKeys["imgproc"]["extract"]
        extract = self.workflow.data["extract"]
        if not set(extractKeys["required"]) <= set(extract.keys()):
            self.err += "Imgproc, Extract Error: Extract specification doesn't have all required keys: '%s'. \n" % (extractKeys["required"],)
            return
        for key in extract:
            if key not in extractKeys["required"] and key not in extractKeys["optional"]:
                self.err += "Imgproc, Extract Error: Invalid option '%s' specified. \n " % (key,)

        ## Manual hist-bin validation ##
        if "histogram-bin" in extract:
            hist = {
                "executable": "ih-stats-histogram-bin",
                "inputs": ["images"],
                "outputs": ["none"],
                "name": "histogramBin",
                # Validate a copy: argument validation rewrites values in
                # place and the original dicts are compared just below.
                "arguments": extract["histogram-bin"].copy(),
            }
            self._validateArguments(hist, "extract")
            if not self.err:
                if set(extract["histogram-bin"]["--group"].keys()) != set(extract["histogram-bin"]["--channels"].keys()):
                    self.err += "Imgproc, Extract Error: Histogram bin group names don't match between '--group' and '--channels'. \n"

        if not self.err:
            for imtype in extract["workflows"]:
                if imtype not in self.workflow.data["workflows"]:
                    self.err += "Imgproc, Extract Error: Extraction specified for type '%s'.  No processing workflow defined for that type. \n" % (imtype,)
                    continue
                job = extract["workflows"][imtype]
                # 'arguments' may be given as null in the template.
                arguments = job.get("arguments") or {}
                if "--dimfromroi" in arguments:
                    fpath = arguments["--dimfromroi"]
                    if os.path.isfile(fpath):
                        self.workflow.rawFiles[imtype].append(fpath)
                job["executable"] = "ih-extract"
                job["outputs"] = ["none"]
                job["name"] = imtype + "_extract"
                self._validateArguments(job, imtype)
                self._validateDependencies(job, imtype)
        return

    def _validateWorkflow(self):
        """
            Validates the input workflow template, making sure all required
            keys are inputted, and all names resolve.  An unexpected
            exception in a sub-check prints its traceback and is recorded
            as a halted validation.
        """
        topKeys = conf.templateKeys["imgproc"]
        if set(topKeys["required"]) <= set(self.workflow.data.keys()):
            for key in self.workflow.data.keys():
                if key not in topKeys["required"] and key not in topKeys["optional"]:
                    self.err += "Imgproc, Key error: Invalid key '%s' specified.  Allowed keys are '%s'. \n" % (key, topKeys["required"] + topKeys["optional"])
            try:
                self._validateWorkflowJobs()
                self._validateWorkflowOptions()
                self._validateWorkflowExtract()
            except Exception:
                print(traceback.format_exc())
                self.err += "Validation halted.  Fix current issues before re-running. \n"
                self.printErrors()
        else:
            self.err += "Imgproc, Template Error: Workflow file does not have all the required keys: '%s'. \n" % (topKeys["required"],)
        return

    def _validateDB(self):
        """
            Validates the inputted metadata db, ensuring the appropriate
            column names are in the database and that the path stored in
            the first row of the images table is an existing file.
        """
        # A Validator that failed to load has no usable connection.
        conn = getattr(self.db, "conn", None)
        if conn is None:
            self.err += "DB, Load Error: Could not connect to meta-data db. \n"
            return
        cols = [row[1] for row in conn.execute("PRAGMA table_info(images)")]
        if not set(conf.outHeaders) <= set(cols):
            self.err += "DB, Key Error: Meta-data file does not have all the required headers: %s \n" % (str(conf.outHeaders),)
            return
        # fetchone() returns None on an empty table instead of raising.
        row = conn.execute("select path from images limit 0,1").fetchone()
        if row is None:
            self.err += "DB, Path Error: Images table is empty, no test file to check. \n"
            return
        testpath = row[0]
        if not os.path.isfile(testpath):
            self.err += "DB, Path Error: Test file: '%s' could not be found. \n" % (testpath,)
        return
class Statistics(Workflow):
    """
    Validator for statistics workflow templates.

    Requires a database that already holds columns beyond the required
    conf.outHeaders, and only accepts executables of type "statistics".
    """

    def __init__(self, template, config, database):
        """
            :param template: Path to the workflow template (JSON).
            :param config: Path to the configuration file (JSON).
            :param database: Path to the sqlite image database.
        """
        super(Statistics, self).__init__(template, config, database)
        return

    def validate(self):
        """
            Validates all inputted files.  Because the workflow is
            dependent on the config and metadata, workflow validation
            is only done if config and metadata validate successfully.
        """
        if not self.err:
            self._validateConfig()
            self._validateDB()
            if not self.err:
                self._validateWorkflow()
        return

    def _validateDB(self):
        """
            Checks that the images table has entries, holds every column
            in conf.outHeaders, and holds at least one further column.
        """
        try:
            result = self.db.conn.execute("select * from images")
            cols = [row[1] for row in self.db.conn.execute("PRAGMA table_info(images)")]
            # Strict superset: at least one column besides the required
            # headers must be present.
            if not set(cols) > set(conf.outHeaders):
                self.err += "Stats, Database Error: Database specified does not contain any numeric columns, process images first. \n"
            if not set(conf.outHeaders) <= set(cols):
                self.err += "Stats, Database Error: Database specified does not contain all the required column names: '%s'. \n " % (conf.outHeaders,)
            if not result.fetchone():
                self.err += "Stats, Database Error: Database specified does not contain an image table with any entries. \n"
        except sqlite3.DatabaseError:
            self.err += "Stats, Database Error: File specified is not a valid database file. \n"
        return

    def _validateWorkflow(self):
        """
            Validates the statistics template: allowed top level keys,
            unique job names per type, required / allowed job keys, a
            known statistics executable that exists under 'installdir',
            and finally each job's arguments and dependencies.
        """
        if not self.workflow.data:
            self.err += "Stats, Load Error: Could not load stats file. \n"
            return
        statKeys = conf.templateKeys["stats"]
        for key in self.workflow.data.keys():
            if key not in statKeys["required"] and key not in statKeys["optional"]:
                self.err += "Stats, Key error: Invalid key '%s' specified.  Allowed keys are '%s'. \n" % (key, statKeys["required"] + statKeys["optional"])
        for imtype in self.workflow.data["workflows"]:
            self.workflow.rawFiles[imtype] = []
            jobs = self.workflow.data["workflows"][imtype]
            names = [x["name"] for x in jobs]
            if len(names) != len(set(names)):
                self.err += "Stats, Name Error: Cannot parse ambiguous workflow.  Workflow defined for type '%s' contains multiple jobs with the same name. \n" % (imtype,)
                continue
            for job in jobs:
                if not set(statKeys["job"]["required"]) <= set(job.keys()):
                    self.err += "Stats, Key Error: Job '%s' doesn't have all required keys: '%s'. \n" % (job["name"], statKeys["job"]["required"])
                    continue
                for key in job.keys():
                    if key not in statKeys["job"]["required"] and key not in statKeys["job"]["optional"]:
                        self.err += "Stats, Key Error: Job '%s' has invalid key '%s' specified.  Allowed keys are '%s'. \n" % (job["name"], key, statKeys["job"]["required"] + statKeys["job"]["optional"])
                if job["executable"] not in conf.valid:
                    self.err += "Stats, Executable Error: Job '%s' executable '%s' is not a valid executable. \n" % (job["name"], job["executable"])
                    continue
                if conf.valid[job["executable"]]["type"] != "statistics":
                    self.err += "Stats, Executable Error: Job '%s' has invalid executable '%s' specified.  Only statistics scripts are allowed. \n" % (job["name"], job["executable"])
                    continue
                # The parsed config lives on the Validator's .data; the
                # Validator object itself cannot be subscripted.
                exepath = self.config.data["installdir"] + "/" + job["executable"]
                if os.path.isfile(exepath):
                    self._validateArguments(job, imtype)
                    self._validateDependencies(job, imtype)
                else:
                    self.err += "Stats, Path Error: Job '%s' executable '%s' does not exist. \n" % (job["name"], exepath)
        return
class ImageLoader(Validator):
531

aknecht2's avatar
aknecht2 committed
532 533 534
    def __init__(self, f):
        """
            Loads the image loading template found at path 'f'.  All of the
            work is delegated to the Validator constructor with the input
            type fixed to "file", which parses the file as json into
            self.data and records any problems in self.err.
        """
        super(ImageLoader, self).__init__(f, "file")
535

aknecht2's avatar
aknecht2 committed
536 537 538 539 540 541 542 543 544
    def validate(self):
        """
            Validates the given template.  Nothing is checked unless
            self.isValid() (provided by the base class) returns a true value.
            First checks that all required top-level keys are present, then
            that no unknown keys are present, and only if no errors have been
            recorded so far runs the specific validator for each key.
            All problems are appended to self.err, nothing is returned.
        """
        if not self.isValid():
            return

        required = conf.templateKeys["loading"]["required"]
        optional = conf.templateKeys["loading"]["optional"]

        if not set(required) <= set(self.data.keys()):
            self.err += "Loader, Key Error: Not all required keys specified: '%s' \n" % (required,)
            return

        for key in self.data:
            if key not in required and key not in optional:
                # Terminate with a real newline like every other message.
                self.err += "Loader, Key Error: Invalid key '%s' specified. \n" % (key,)

        if not self.err:
            # Built once, every key left in self.data is known to be allowed
            # at this point.  A KeyError here means conf allows a key that
            # has no validator in this table.
            validators = {
                "path": self._validatePath,
                "base": self._validateBase,
                "data": self._validateData,
                "translations": self._validateTranslations,
                "order": self._validateOrder,
                "filetype": self._validateFtype,
            }
            for key in self.data:
                validators[key]()
        return
559

aknecht2's avatar
aknecht2 committed
560 561 562 563 564 565
    def _validateFtype(self):
        if self.data["filetype"]:
            if self.data["filetype"] not in conf.imageExtensions:
                self.err += "Loader, File Error: Value '%s' specified for 'filetype' is invalid.  Must be on of '%s'. \n" % (self.data["filetype"], conf.imageExtensions)
        else:
            self.err += "Loader, File Error: No value specified for 'filetype'. \n"
566

aknecht2's avatar
aknecht2 committed
567 568 569 570 571 572 573 574
    def _validatePath(self):
        """
            Validates the path.  Because the path contains potential references, path existence
            cannot be checked at this stage.
        """
        if not self.data["path"]:
            self.err += "Loader, Path Error: No value specified for 'path'.\n"
        return
575

aknecht2's avatar
aknecht2 committed
576 577 578 579
    def _validateBase(self):
        """
            Validates the base walk path.  Checks existence of a value, if the value
            is a subset of path, and if the path exists.
580
        """
aknecht2's avatar
aknecht2 committed
581 582 583 584 585 586 587 588 589
        if self.data["base"]:
            if self.data["base"] in self.data["path"]:
                if not os.path.exists(self.data["base"]):
                    self.err += "Loader, Base Error: Value for 'base' is not a valid path. \n"
            else:
                self.err += "Loader, Base Error: Value for 'base' is not a subset of 'path'. \n"
        else:
            self.err += "Loader, Base Error: No value specified for 'base'.\n"
        pass
590

aknecht2's avatar
aknecht2 committed
591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613
    def _validateData(self):
        """
            Validates the data!  Uses a seperate function for each data type.
        """
        if self.data["data"]:
            for key in self.data["data"]:
                if "type" in self.data["data"][key]:
                    if self.data["data"][key]["type"] in conf.templateKeys["loading"]["data"]["type"]:
                        if key in self.data["order"]:
                            d = {
                                "value": self._validateDataValue,
                                "file": self._validateDataFile,
                                "date": self._validateDataDate
                            }[self.data["data"][key]["type"]](key)
                        else:
                            self.err += "Loader, Data Error: Definition for data key '%s', no corresponding definition in 'order'. \n" % (key,)
                    else:
                        self.err += "Loader, Data Error: Invalid type '%s' specified for data key '%s', must be one of '%s'. \n" % (self.data["data"][key]["type"], key, conf.templateKeys["loading"]["data"]["type"])
                else:
                    self.err += "Loader, Data Error: No value specified for 'type' of data key '%s'. \n" % (key,)
        else:
            self.err += "Loader, Value Error: No value specified for 'data'. \n"
        pass
614

aknecht2's avatar
aknecht2 committed
615 616 617 618 619 620 621 622 623
    def _validateDataValue(self, key):
        """
            Validates type 'value'.  Values can be combinations of hard-coded text
            as well as references to path identifiers  Checks existence of a value,
            if all the references are valid, and if translation is specified, that
            translations exist.
        """
        if self.data["data"][key]["value"]:
            if set(conf.templateKeys["loading"]["data"]["value"]["required"]) <= set(self.data["data"][key].keys()):
624

aknecht2's avatar
aknecht2 committed
625 626 627
                for subkey in self.data["data"][key]:
                    if subkey not in conf.templateKeys["loading"]["data"]["value"]["required"] and subkey not in conf.templateKeys["loading"]["data"]["value"]["optional"]:
                        self.err += "Loader, Data Value Error: Invalid key '%s' specified for data key '%s'. \n" % (subkey,key)
628

aknecht2's avatar
aknecht2 committed
629 630 631 632 633 634 635
                m = re.findall(r"%(\w+)%", self.data["data"][key]["value"])
                for val in m:
                    if "%" + val + "%" not in self.data["path"]:
                        self.err += "Loader, Data Value Reference Error: Could not reference identifier '%s' for data key '%s'. \n" % (val, key)
                if "translate" in self.data["data"][key]:
                    if key not in self.data["translations"]:
                        self.err += "Loader, Data Value Translation Error: Can't translate data key '%s', no translation specified. \n" % (key,)
636 637 638
                if "case" in self.data["data"][key]:
                    if self.data["data"][key]["case"] not in ["lower", "upper"]:
                        self.err += "Loader, Data Value Error: Case for key '%s' is defined as '%s', should be either 'lower' or 'upper'. \n" % (key, self.data["data"][key]["case"])
aknecht2's avatar
aknecht2 committed
639 640 641 642 643 644 645
            else:
                print set(conf.templateKeys["loading"]["data"]["value"]["required"])
                print set(self.data["data"][key].keys())
                self.err += "Loader, Data Value Error: Data key '%s' does not have all required keys '%s'. \n" % (key, conf.templateKeys["loading"]["data"]["value"]["required"])
        else:
            self.err += "Loader, Data Value Error: No value specified for data key '%s'. \n " % (key,)
        return
646

aknecht2's avatar
aknecht2 committed
647 648 649 650 651 652 653
    def _validateDataFile(self, key):
        """
            Validates type 'file'.  Checks that all required keys are given
            and no unknown keys are used, that the file named by 'value' is an
            existing regular file, that 'keyColumn' and 'valueColumn' are
            within the number of columns of its first line, and that every
            %identifier% reference in 'key' also occurs in 'path'.  The file
            is split on 'separator' when given, otherwise on ','.
            Problems are appended to self.err.

            'key' is the name of the entry in self.data["data"] to check.
        """
        entry = self.data["data"][key]
        required = conf.templateKeys["loading"]["data"]["file"]["required"]
        optional = conf.templateKeys["loading"]["data"]["file"]["optional"]

        if not set(required) <= set(entry.keys()):
            self.err += "Loader, Data File Error: Data key '%s' does not have all required keys '%s'. \n" % (key, required)
            return

        for subkey in entry:
            if subkey not in required and subkey not in optional:
                self.err += "Loader, Data File Error: Invalid key '%s' specified for data key '%s'. \n" % (subkey, key)

        fname = entry["value"]
        # isfile rather than exists: a directory exists but cannot be opened
        # for reading, which would raise instead of recording an error.
        if os.path.isfile(fname):
            with open(fname, "r") as rh:
                firstline = rh.readline()
            sep = entry.get("separator", ",")
            # Only the first line is used to count the available columns.
            maxlength = len(firstline.split(sep))
            if maxlength <= entry["keyColumn"]:
                self.err += "Loader, Data File Error: Key column for data key '%s' out of range. \n" % (key,)
            if maxlength <= entry["valueColumn"]:
                self.err += "Loader, Data File Error: Value column for data key '%s' out of range. \n" % (key,)
        else:
            self.err += "Loader, Data File Error: File '%s' specified for data key '%s' does not exist. \n" % (fname, key)

        # Every %name% used in the lookup key has to be defined by the path.
        for val in re.findall(r"%(\w+)%", entry["key"]):
            if "%" + val + "%" not in self.data["path"]:
                self.err += "Loader, Data Value Reference Error: Could not reference identifier '%s' for data key '%s'. \n" % (val, key)
        return
678

aknecht2's avatar
aknecht2 committed
679 680 681 682 683 684 685
    def _validateDataDate(self, key):
        """
            Validates type 'date'.  Checks that all required keys are given
            and no unknown keys are used, that 'format' is made up only of
            the characters in conf.dateFormat and conf.dateSep, and that
            every %identifier% reference in 'value' also occurs in 'path'.
            Whether the format really fits the values is only known in the
            crawling step.  Problems are appended to self.err.
        """
        spec = self.data["data"][key]
        date_keys = conf.templateKeys["loading"]["data"]["date"]

        if not set(date_keys["required"]) <= set(spec.keys()):
            self.err += "Loader, Data Date Error: Data key '%s' does not have all required keys '%s'. \n" % (key, date_keys["required"])
            return

        for subkey in spec:
            if not (subkey in date_keys["required"] or subkey in date_keys["optional"]):
                self.err += "Loader, Data Date Error: Invalid key '%s' specified for data key '%s'. \n" % (subkey, key)

        if not set(spec["format"]) <= set(conf.dateFormat + conf.dateSep):
            self.err += "Loader, Data Date Error: Invalid values in data key '%s', format only supports '%s'. \n" % (key, conf.dateFormat + conf.dateSep)

        identifiers = re.findall(r"%(\w+)%", spec["value"])
        for ident in identifiers:
            if "%" + ident + "%" not in self.data["path"]:
                self.err += "Loader, Data Value Reference Error: Could not reference identifier '%s' for data key '%s'. \n" % (ident, key)
        return
701

aknecht2's avatar
aknecht2 committed
702 703 704 705 706 707 708 709
    def _validateTranslations(self):
        """
            Nothing to see here, move along.
        """
        for key in self.data["translations"]:
            if key not in self.data["data"]:
                self.err += "Loader, Translation Error: Translation defined for '%s', however, no such data value exists. \n" % (key,)
        return
710

aknecht2's avatar
aknecht2 committed
711 712 713 714 715 716 717 718 719 720 721 722
    def _validateOrder(self):
        """
            Validates the order -- the order you want to write values to the csv.
            Checks existence of a value, as well as ensuring all values specified
            have corresponding data values.
        """
        if self.data["order"]:
            if not set(self.data["order"]) == set(self.data["data"].keys() + ["path"]):
                self.err += "Loader, Order Error: Order must contain ALL data keys AND 'path'. \n"
        else:
            self.err += "Loader, Order Error: No value specified for order. \n"
        return