Commit 3ab6fe71 authored by aknecht2's avatar aknecht2
Browse files

Added a function to remove unused files from gridfs.

parent 3f749a89
......@@ -10,6 +10,7 @@ import collections
import chipathlon.conf
from pprint import pprint
import hashlib
from chipathlon.utils import progress
class MongoDB(object):
......@@ -350,6 +351,42 @@ class MongoDB(object):
return (valid, msg, data)
def clean_gfs(self):
This function finds all files stored in gridfs that are not currently
referenced by any result file and removes them.
A clean database is a happy database.
cursor = self.db.results.aggregate([
"$group": {
"_id": 1,
"valid_ids": {"$push": "$gridfs_id"}
# Doc contains all our valid ids
id_doc =
# Find all fs.files documents
gfs_cursor = self.db.fs.files.find({
"_id": {
"$nin": id_doc["valid_ids"]
# Iterate through file, delete fs.chunks then fs.files
total_files = gfs_cursor.count()
print "Found %s unused gridfs files. Preparing to delete...." % (total_files,)
for i, fs_file in enumerate(gfs_cursor):
progress(i, total_files)
"files_id": fs_file["_id"]
"_id": fs_file["_id"]
progress(total_files, total_files)
def get_samples(self, experiment_accession, file_type):
:param experiment_accession: Accession number of the experiment to grab samples from.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment