Commit 3ab6fe71 authored by aknecht2
Browse files

Added a function to remove unused files from gridfs.

parent 3f749a89
......@@ -10,6 +10,7 @@ import collections
import chipathlon.conf
from pprint import pprint
import hashlib
from chipathlon.utils import progress
class MongoDB(object):
......@@ -350,6 +351,42 @@ class MongoDB(object):
)
return (valid, msg, data)
def clean_gfs(self):
    """
    Delete every gridfs file that is not referenced by a result document.

    The ``gridfs_id`` values of all documents in ``results`` are gathered,
    then each ``fs.files`` document whose ``_id`` is not among them is
    removed together with its ``fs.chunks`` documents.  When ``results``
    holds no documents, no file is referenced, so every gridfs file is
    treated as unused and removed.
    A clean database is a happy database.

    :returns: None
    """
    cursor = self.db.results.aggregate([
        {
            "$group": {
                "_id": 1,
                "valid_ids": {"$push": "$gridfs_id"}
            }
        }
    ])
    # $group emits no document at all when results is empty, so fall back
    # to an empty id list instead of letting next() raise StopIteration.
    id_doc = next(cursor, None)
    valid_ids = id_doc["valid_ids"] if id_doc is not None else []
    # Snapshot the unused ids before deleting anything: this avoids
    # removing documents from fs.files while a cursor over that same
    # collection is still open, and the count is exactly what we iterate.
    # Only _id is needed, so project everything else away.
    unused_ids = [
        fs_file["_id"]
        for fs_file in self.db.fs.files.find(
            {"_id": {"$nin": valid_ids}},
            {"_id": 1}
        )
    ]
    total_files = len(unused_ids)
    # Single-argument parenthesised print emits the same text under both
    # the Python 2 print statement and the Python 3 print function.
    print("Found %s unused gridfs files. Preparing to delete...." % (total_files,))
    if not unused_ids:
        # Nothing to delete, so there is no progress to report.
        return
    for i, file_id in enumerate(unused_ids):
        progress(i, total_files)
        # Chunks go first: if the run is interrupted, the fs.files document
        # is still present and unreferenced, so the next run retries it.
        self.db.fs.chunks.delete_many({"files_id": file_id})
        self.db.fs.files.delete_one({"_id": file_id})
    progress(total_files, total_files)
    return
def get_samples(self, experiment_accession, file_type):
"""
:param experiment_accession: Accession number of the experiment to grab samples from.
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment