Commit 12d2ea5a authored by aknecht2

Merge branch '3-method-auto-doc' into 'master'

Resolve "Method Auto Doc"

Closes #3

See merge request !25
parents 1d2ebea3 da85b698
@@ -12,18 +12,23 @@ import hashlib
 class MongoDB(object):
+    """
+    :param host: The host address of the MongoDB database.
+    :type host: str
+    :param username: The username of the account for the MongoDB database.
+    :type username: str
+    :param password: The password for the user.
+    :type password: str
+    :param debug: A flag for printing additional messages.
+    :type debug: bool
+
+    This class is used to manage all interactions with the ENCODE metadata.
+    The metadata can be very unruly and difficult to deal with, so there are
+    several helper functions within this class to make common database
+    operations much easier.
+    """
 
     def __init__(self, host, username, password, debug=False):
-        """
-        :param host: The host address of the MongoDB database.
-        :type host: str
-        :param username: The username of the account for the MongoDB database.
-        :type username: str
-        :param password: The password for the user.
-        :type password: str
-        :param debug: If true print out debug messages
-        :type debug: bool
-        """
         self.debug = debug
         self.host = host
         self.username = username
@@ -48,6 +53,12 @@ class MongoDB(object):
         :type key: Any hashable
         :param data: The data to add to the cache.
         :type data: Object
+
+        Adds a data result to the internal cache. This is used to speed up
+        identical requests. We may have multiple runs that use identical
+        control / signal files but change the alignment or peak calling
+        tools; in these cases we don't want to request the same information
+        from the database multiple times.
         """
         if function not in self.cache:
             self.cache[function] = {}
@@ -60,6 +71,8 @@ class MongoDB(object):
         :type function: str
         :param key: The key to get from the cache.
         :type key: Any hashable
+
+        Gets a data item from the internal cache.
         """
         if function in self.cache:
             if key in self.cache[function]:
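The two cache methods above boil down to a nested dictionary keyed first by function name and then by request key. Here is a minimal standalone sketch of that pattern, assuming nothing beyond what these hunks show; the function name and accession used below are made up:

    class ResultCache(object):
        """Nested dict cache: cache[function][key] -> data."""

        def __init__(self):
            self.cache = {}

        def add_to_cache(self, function, key, data):
            # Create the per-function bucket on first use.
            if function not in self.cache:
                self.cache[function] = {}
            self.cache[function][key] = data

        def get_from_cache(self, function, key):
            # Miss -> None, so callers can fall back to a real database query.
            if function in self.cache:
                if key in self.cache[function]:
                    return self.cache[function][key]
            return None

    # Two identical requests only need one trip to the database.
    cache = ResultCache()
    if cache.get_from_cache("get_sample", "ENCFF000XYZ") is None:
        cache.add_to_cache("get_sample", "ENCFF000XYZ", {"file_type": "fastq"})
    print(cache.get_from_cache("get_sample", "ENCFF000XYZ"))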
@@ -69,9 +82,9 @@ class MongoDB(object):
     def delete_result(self, result, genome):
         """
         :param result: The result to delete
-        :type result: :py:class:~chipathlon.result.Result
+        :type result: :py:class:`~chipathlon.result.Result`
         :param genome: The genome to find information from.
-        :type genome: :py:meth:~chipathlon.genome.Genome
+        :type genome: :py:class:`~chipathlon.genome.Genome`
         Deletes a result and its corresponding gridfs entry.
         """
@@ -114,11 +127,13 @@ class MongoDB(object):
     def result_exists(self, result, genome):
         """
         :param result: The result to check.
-        :type result: :py:meth:~chipathlon.result.Result
+        :type result: :py:class:`~chipathlon.result.Result`
         :param genome: The genome to find information from.
-        :type genome: :py:meth:~chipathlon.genome.Genome
+        :type genome: :py:class:`~chipathlon.genome.Genome`
-        Check if a result exists.
+        Check if a result exists in the database. The genome parameter
+        is required since some files have been aligned or use individual
+        chromosome fasta or size files for peak calling.
         """
         try:
             cursor = self.db.results.find(self._get_result_query(result, genome))
@@ -130,11 +145,12 @@ class MongoDB(object):
     def get_result_id(self, result, genome):
         """
         :param result: The result to check.
-        :type result: :py:meth:~chipathlon.result.Result
+        :type result: :py:class:`~chipathlon.result.Result`
         :param genome: The genome to find information from.
-        :type genome: :py:meth:~chipathlon.genome.Genome
+        :type genome: :py:class:`~chipathlon.genome.Genome`
-        Get the id of a result.
+        :returns: The id of the result if found, otherwise None.
+        Get the id of a result in the database.
         """
         try:
             cursor = self.db.results.find(self._get_result_query(result, genome))
@@ -177,8 +193,11 @@ class MongoDB(object):
         :param gfs_attributes: Additional metadata to store in gridfs.
         :type gfs_attributes: dict
-        Saves a result file into mongodb and also creates the corresponding
-        gridfs file.
+        Saves a result entry into MongoDB and uploads the file into gridfs.
+        The only difference between additional_data and gfs_attributes is
+        where the metadata is stored: both hold key/value pairs, but
+        additional_data is stored in the result entry, while gfs_attributes
+        is stored in gridfs.
         """
         # Make sure output_file exists
         if os.path.isfile(output_file):
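The split between additional_data and gfs_attributes can be pictured with plain pymongo/gridfs calls. This is only a sketch of the storage layout the docstring describes; the database name, field names, and file path below are illustrative and not taken from chipathlon:

    from pymongo import MongoClient
    import gridfs

    client = MongoClient("localhost")
    db = client["chipathlon"]          # hypothetical database name
    gfs = gridfs.GridFS(db)

    with open("align.bam", "rb") as f:
        # gfs_attributes-style metadata is stored on the GridFS file document.
        gridfs_id = gfs.put(f, filename="align.bam", file_type="bam")

    # additional_data-style metadata is stored on the result entry itself,
    # which points back at the uploaded file.
    db.results.insert_one({"gridfs_id": gridfs_id, "align_tool": "bwa"})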
@@ -218,6 +237,7 @@ class MongoDB(object):
         """
         :param sample_accession: The accession number to check.
         :type sample_accession: str
+        :returns: Whether or not the sample is valid.
         Ensures that a sample with the accession specified actually exists.
         """
@@ -235,6 +255,7 @@ class MongoDB(object):
         """
         :param experiment_accession: The accession number to check.
         :type experiment_accession: str
+        :returns: Whether or not the experiment is valid.
         Ensures that an experiment with the accession specified actually exists.
         """
@@ -252,15 +273,15 @@ class MongoDB(object):
     def fetch_from_gridfs(self, gridfs_id, filename, checkmd5=True):
         """
         :param gridfs_id: GridFS _id of file to get.
-        :type gridfs_id: bson.objectid.ObjectId
+        :type gridfs_id: :py:class:`bson.objectid.ObjectId`
         :param filename: Filename to save file to.
         :type filename: str
         :param checkmd5: Whether or not to validate the md5 of the result
         :type checkmd5: bool
         Fetch the file with the corresponding id and save it under the
-        specified 'filename'. If checkmd5 is specified, validate that the saved
-        file has a correct md5 value.
+        specified 'filename'. If checkmd5 is specified, validate that the
+        saved file has a correct md5 value.
         """
         try:
             gridfs_file = self.gfs.get(gridfs_id)
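For the checkmd5 option above, validating a downloaded file usually means streaming it through hashlib (which this module already imports) and comparing the digest with the checksum recorded for the GridFS file. A rough sketch follows; the comparison is left as a comment because the exact attribute chipathlon reads is not shown in this hunk:

    import hashlib

    def md5_of_file(filename, chunk_size=8192):
        # Stream in chunks so large bam/fastq files do not need to fit in memory.
        md5 = hashlib.md5()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                md5.update(chunk)
        return md5.hexdigest()

    # expected = gridfs_file.md5   # older pymongo GridOut objects expose this
    # assert md5_of_file(filename) == expected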
@@ -298,10 +319,13 @@ class MongoDB(object):
         """
         :param accession: The accession number of the target sample
         :type accession: string
-        :param file_type: The file type of the target sample should be [fastq|bam]
+        :param file_type: The file type of the target sample.
         :type file_type: string
-        Gets the associated sample based on accession number and file_type
+        Gets the associated sample based on accession number and file_type.
+        For loading input files for workflows, the file_type should be fastq
+        or bam. Other file types can be specified for loading additional
+        files saved in the experiment metadata.
         """
         valid = True
         msg = ""
...
@@ -3,18 +3,22 @@ import chipathlon.conf
 from Pegasus.DAX3 import File, PFN
 
 class Genome(object):
+    """
+    :param assembly: Version of the genome used for building, e.g. hg19, grch38p6, mm9.
+    :type assembly: string
+    :param tool: Tool used to create the genome index.
+    :type tool: string
+    :param base_file: Main genome file, typically a .fna or .fa file.
+    :type base_file: string
+    :param chrom_sizes: Chromosome sizes file.
+    :type chrom_sizes: string
+
+    The genome handles loading and validating genome files on disk.
+    It serves as a helper class to make managing genome input files
+    much easier.
+    """
 
     def __init__(self, assembly, tool, base_file, chrom_sizes):
-        """
-        :param assembly: Version of genome used for building i.e. hg19, grch38p6, mm9
-        :type assembly: string
-        :param tool: Tool used to create the genome.
-        :type tool: string
-        :param base_file: Main genome file probably a .fna or .fa file.
-        :type base_file: string
-        :param chrom_sizes: Chromsome sizes file.
-        :type chrom_sizes: string
-        """
         self.assembly = assembly
         self.tool = tool
         self.base_file = base_file
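A construction sketch for the Genome class documented above; the assembly and tool values come from the docstring, and the file paths are placeholders:

    from chipathlon.genome import Genome   # assumes the chipathlon package is importable

    genome = Genome(
        "hg19",                          # assembly
        "bwa",                           # tool used to build the index
        "/data/hg19/hg19.fa",            # base_file (placeholder path)
        "/data/hg19/hg19.chrom.sizes",   # chrom_sizes (placeholder path)
    )
    if not genome.is_valid():
        print(genome.get_error_string())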
@@ -38,7 +42,7 @@ class Genome(object):
     def is_valid(self):
         """
-        Checks if the run is valid.
+        Checks if the genome is valid.
         """
         return len(self.errors) == 0
@@ -49,18 +53,40 @@ class Genome(object):
         return "\n".join(self.errors)
 
     def get_base_file(self):
+        """
+        :returns: The full name of the base file.
+        """
         return self.files.get("base_file")
 
     def get_chrom_sizes(self):
+        """
+        :returns: The full name of the chromosome sizes file.
+        """
         return self.files.get("chrom.sizes")
 
     def get_additional_files(self):
+        """
+        :returns: A list of all additional genome indices.
+
+        For bwa this returns the file names of the .amb, .ann, .bwt, .pac,
+        and .sa files. For bowtie2 it returns the file names of the .1.bt2,
+        .2.bt2, .3.bt2, .4.bt2, .rev.1.bt2, and .rev.2.bt2 files.
+        """
         return self.files.get("additional_files")
 
     def get_chr_fasta_files(self):
+        """
+        :returns: A list of all individual chromosome fasta files (if they exist).
+        """
         return self.files.get("chr_fasta")
 
     def get_all_files(self):
+        """
+        :returns: A list of all files.
+
+        This includes the base file, the chromosome sizes file, all additional
+        genome indices, and all individual chromosome fasta files.
+        """
         return [self.get_base_file(), self.get_chrom_sizes()] + self.get_additional_files() + self.get_chr_fasta_files()
 
     def _load_prefixes(self):
...
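The index suffixes listed for get_additional_files suggest the additional files are just a tool-specific suffix list attached to the genome's base name. A standalone sketch of that idea; the suffix lists are taken from the docstring above, but the dictionary and the exact way each suffix attaches to the base name are assumptions (chipathlon.conf presumably holds the real definitions):

    INDEX_SUFFIXES = {
        "bwa": [".amb", ".ann", ".bwt", ".pac", ".sa"],
        "bowtie2": [".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", ".rev.1.bt2", ".rev.2.bt2"],
    }

    def additional_index_files(base_name, tool):
        # e.g. additional_index_files("hg19.fa", "bwa")
        #   -> ["hg19.fa.amb", "hg19.fa.ann", "hg19.fa.bwt", "hg19.fa.pac", "hg19.fa.sa"]
        return [base_name + suffix for suffix in INDEX_SUFFIXES[tool]]

    print(additional_index_files("hg19.fa", "bwa"))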
@@ -5,42 +5,38 @@ from Pegasus.DAX3 import File, PFN
 class Result(object):
     """
-    A class containing information about a result file.
-    i.e. MongoDB metadata, as well as all jobs and arguments
-    run on the result file up to this point.
+    :param logical_name: The unique name of the file as presented in the module yaml.
+    :type logical_name: string
+    :param control_samples: The sample data for control files as taken from MongoDB.
+    :type control_samples: list
+    :param signal_samples: The sample data for signal files as taken from MongoDB.
+    :type signal_samples: list
+    :param all_markers: The markers for the entire workflow so far. Should be indexed by module.
+    :type all_markers: dict
+    :param all_jobs: A list of jobs for the entire workflow so far.
+    :type all_jobs: list
+    :param should_save: Whether or not the result should be saved to the database.
+    :type should_save: boolean
+    :param prefix_join: How to combine prefixes for the final result.
+    :type prefix_join: str
+    :param name_template: A template to load the correct name of the result.
+    :type name_template: str
+    :param last_result: Whether or not this is the last result in a module.
+    :type last_result: boolean
+    The result class is for managing all intermediate output files. A result
+    contains all information about a file -- all MongoDB metadata, as well as
+    the jobs and their arguments that have been run on the file up to this
+    point. The full name of the file will be a prefix + the given logical
+    name. Consider a bwa paired end read alignment where our two paired ends
+    are control files with accessions ENCF0001 and ENCF0002 respectively.
+    The prefix would be computed as ENCF0001_ENCF0002_bwa_paired_, and the
+    final output file from align would be
+    ENCF0001_ENCF0002_bwa_paired_align.bam.
     """
 
     def __init__(self, logical_name, control_samples, signal_samples, all_markers, all_jobs, should_save=False, prefix_join=None, name_template=None, last_result=False):
-        """
-        :param logical_name: The unique name of the file as presented in the module yaml
-        :type logical_name: string
-        :param control_samples: The sample data for control files as taken from MongoDB
-        :type control_samples: list
-        :param signal_samples: The sample data for signal files as taken from MongoDB
-        :type signal_samples: list
-        :param all_markers: The markers for the entire workflow so far. Should be indexed by module.
-        :type all_markers: dict
-        :param all_jobs: A list of jobs for the entire workflow so far.
-        :type all_jobs: list
-        :param should_save: Whether or not the result should be saved to the database.
-        :type should_save: boolean
-        :param prefix_join: How to combine prefixes for the final result
-        :type prefix_join str
-        :param name_template: A template to load the correct name of the result
-        :type name_template: str
-        :param last_result: A boolean to determine if result is the last result in a module.
-        :type last_result: boolean
-        The result class is for managing all intermediate output files.
-        It also helps manage checking if a result already exists for the
-        purpose of generators creating jobs.
-        The full name of the fill will be a prefix + the given logical name
-        Consider a bwa paired end read alignment, say we our two paired ends
-        are control files with accession ENCF0001 and ENCF0002 respectively.
-        The prefix would be computed as ENCF0001_ENCF0002_bwa_paired_
-        The final output file from align would be ENCF0001_ENCF0002_bwa_paired_align.bam
-        """
         self.logical_name = logical_name
         self.all_markers = all_markers
         self.control_samples = control_samples
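The naming walk-through in the new class docstring can be condensed into a small sketch. The joining rules below are simplified assumptions (the real class also supports prefix_join and name_template), but the example reproduces the ENCF0001_ENCF0002_bwa_paired_align.bam name from the docstring:

    def result_file_name(control_accessions, signal_accessions, markers, logical_name):
        # Prefix = accessions plus the markers of every job run so far,
        # joined with underscores, followed by the logical name.
        parts = control_accessions + signal_accessions + markers
        return "_".join(parts) + "_" + logical_name

    print(result_file_name(["ENCF0001", "ENCF0002"], [], ["bwa", "paired"], "align.bam"))
    # -> ENCF0001_ENCF0002_bwa_paired_align.bam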
@@ -96,7 +92,9 @@ class Result(object):
         :param val: The meta value.
         :type val: Any object
-        Adds a metadata value.
+        Adds a metadata value. This function is pretty generic, but sometimes
+        we want to include additional information on result files beyond the
+        metadata that already exists in ENCODE.
         """
         self.meta[key] = val
         return
@@ -106,7 +104,9 @@ class Result(object):
         :param key: The meta key.
         :type key: hashable
-        Gets a stored metadata value.
+        Gets a stored metadata value. This function is pretty generic, but
+        sometimes we want to include additional information on result files
+        beyond the metadata that already exists in ENCODE.
         """
         return self.meta.get(key)
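A runnable stand-in mirroring the two meta helpers above (the real class is chipathlon.result.Result; the key and values are made up):

    class MetaDemo(object):
        def __init__(self):
            self.meta = {}

        def add_meta(self, key, val):
            self.meta[key] = val

        def get_meta(self, key):
            return self.meta.get(key)

    result = MetaDemo()
    result.add_meta("read_end", "paired")   # extra info not present in the ENCODE metadata
    print(result.get_meta("read_end"))      # -> paired
    print(result.get_meta("missing"))       # -> None, since meta is a plain dict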
@@ -127,8 +127,7 @@ class Result(object):
         :type mdb: MongoDB
         :param genome: The genome class containing genomic information
         :type genome: Genome
-
-        Checks whether or not the result exists in the database.
+        :returns: Whether or not the result exists in the database.
         """
         return mdb.result_exists(self, genome)
...
@@ -4,26 +4,27 @@ from pprint import pprint
 class Run(object):
     """
-    A class representing an individual smallest workflow.
+    :param genome: The genome class containing alignment, assembly and file information.
+    :type genome: :py:class:`~chipathlon.genome.Genome`
+    :param peak: The peak calling method to be used.
+    :type peak: string
+    :param signals: List of accessions for signal samples.
+    :type signals: list
+    :param controls: List of accessions for control samples.
+    :type controls: list
+    :param file_type: The base file type, should be one of [fastq|bam].
+    :type file_type: string
+    :param peak_type: The peak calling type, should be one of [narrow|broad].
+    :type peak_type: string
+    :param idr: A list of two signal accessions corresponding to which peak results to run with idr.
+    :type idr: list
+
+    A run has everything necessary for a single workflow to be run from start
+    to finish: definitions of the tools to use, as well as a list of control
+    and signal input files to use them on.
     """
 
     def __init__(self, genome, peak, signals, controls, file_type, peak_type=None, idr=None):
-        """
-        :param genome: The genome class containing alignment, assembly and file information.
-        :type genome: :py:class:chipathlon.genome.Genome
-        :param peak: The peak calling method to be used.
-        :type peak: string
-        :param signals: List of accessions for signal samples
-        :type signals: string
-        :param controls: List of accession for control samples
-        :type controls: list
-        :param file_type: The base file type, should be one of [fastq|bam]
-        :type file_type: string
-        :param peak_type: The peak calling type, should be on of [narrow|broad]
-        :type peak_type: string
-        :param idr: A list of two signal accessions corresponding to which peak results to run with idr.
-        :type idr: boolean
-        """
         self.genome = genome
         self.peak = peak
         self.signals = signals
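A hedged construction sketch for the Run class documented above, following the __init__ signature shown in this hunk; the peak caller name and the accessions are placeholders, since the diff does not list the supported tools:

    from chipathlon.genome import Genome   # assumes the chipathlon package is importable
    from chipathlon.run import Run

    genome = Genome("hg19", "bwa", "/data/hg19/hg19.fa", "/data/hg19/hg19.chrom.sizes")
    run = Run(
        genome,
        "macs2",                          # peak -- placeholder peak calling method
        ["ENCFF000AAA", "ENCFF000AAB"],   # signals -- placeholder accessions
        ["ENCFF000AAC"],                  # controls -- placeholder accessions
        "fastq",                          # file_type, one of [fastq|bam]
        peak_type="narrow",               # one of [narrow|broad]
    )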
@@ -71,14 +72,14 @@ class Run(object):
     def _load_sample_data(self, mdb, accession, sample_type):
         """
-        Loads records from the database for an individual accession
         :param mdb: MongoDB instance to fetch samples from.
-        :type mdb: :py:class:chipathlon.db.MongoDB
+        :type mdb: :py:class:`~chipathlon.db.MongoDB`
         :param accession: Accession to load data for.
         :type accession: string
         :param sample_type: Whether the sample is a signal or control file. Should be [signal|control]
         :type sample_type: string
+
+        Loads records from the database for an individual accession.
         """
         if accession is not None:
             valid, msg, sample = mdb.get_sample(accession, self.file_type)
@@ -97,10 +98,11 @@ class Run(object):
     def load_samples(self, mdb):
         """
         :param mdb: MongoDB class instance
-        :type mdb: :py:class:chipathlon.db.MongoDB
+        :type mdb: :py:class:`~chipathlon.db.MongoDB`
+        :returns: None
         Loads samples based on the accessions defined in self.signals and
-        self.controls
+        self.controls.
         """
         for accession in self.signals:
             self._load_sample_data(mdb, accession, "signal")
@@ -110,13 +112,13 @@ class Run(object):
     def is_valid(self):
         """
-        Checks if the run is valid.
+        :returns: Whether or not the run is valid.
         """
         return len(self.errors) == 0
 
     def get_error_string(self):
         """
-        Returns the errors as a newline separated string.
+        :returns: The errors as a newline separated string.
         """
         return "\n".join(self.errors)
@@ -124,22 +126,22 @@ class Run(object):
         """
         :param sample_type: Type of samples to retrieve, either control or signal.
         :type sample_type: str
+        :returns: The samples if they exist, else None.
         Gets the MongoDB metadata associated with all samples for the run.
         """
-        if sample_type in self.samples:
-            return self.samples[sample_type]
-        else:
-            return None
+        return self.samples.get(sample_type)
 
     def add_result(self, module_name, result):
         """
-        :param module_name: The name of the module (i.e. align, remove_duplicates, peak_call...) to add a result for.
+        :param module_name: The name of the module (e.g. align, remove_duplicates, peak_call...) to add a result for.
         :type module_name: string
         :param result: The result class containing necessary metadata information about the module output file.
-        :type result: :py:class:chipathlon.result.Result
+        :type result: :py:class:`~chipathlon.result.Result`
-        Adds a result to the current run indexed by module_name.
+        Adds a result to the current run indexed by module_name and logical
+        name. For example, if you add the final result from the align module
+        (align.bam), it will be indexed under results["align"]["align.bam"].
         """
         if module_name not in self.results:
             self.results[module_name] = {}
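The indexing that add_result and get_results describe is essentially results[module_name][logical_file_name] -> list of results. A standalone sketch of that structure; the module and file names follow the docstring examples, and storing the entries in a list is inferred from get_results documenting a list return:

    results = {}

    def add_result(module_name, logical_name, result):
        # One list of results per (module, logical file name) pair.
        if module_name not in results:
            results[module_name] = {}
        results[module_name].setdefault(logical_name, []).append(result)

    def get_results(module_name, logical_name):
        # Matches the documented behavior: an empty list when nothing matches.
        return results.get(module_name, {}).get(logical_name, [])

    add_result("align", "align.bam", "ENCF0001_ENCF0002_bwa_paired_align.bam")
    print(get_results("align", "align.bam"))   # -> ['ENCF0001_ENCF0002_bwa_paired_align.bam']
    print(get_results("peak_call", "peaks"))   # -> []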
@@ -155,9 +157,10 @@ class Run(object):
         :type module_name: string
         :param logical_file_name: The unique file name as seen in the module yaml file, e.g. align.bam.
         :type logical_file_name: string
-        Returns a result based on module_name and logical_file_name.
-        Returns None if no such result exists.
+        :returns: A list of results, or an empty list if no results are found.
+        Returns all results that match the provided module and logical file
+        name. If no results exist, get_results returns an empty list.
         """
         if module_name in self.results:
             if logical_file_name in self.results[module_name]:
@@ -171,11 +174,13 @@ class Run(object):
         :param logical_file_name: The unique file name as seen in the module yaml file, e.g. align.bam.
         :type logical_file_name: string
         :param final_result: The final result of the module w/ markers