# Code/process_data.py, as added in commit 0a3fd5b62065333669c7b391c626cb2505217617
# ("First commit", Matt Cooper, Fri, 12 Aug 2016 16:48:46 -0400).
import numpy as np
import getopt
import sys
from glob import glob

import constants as c
from utils import process_clip

try:
    _lazy_range = xrange  # Python 2: avoid materializing a multi-million element list
except NameError:
    _lazy_range = range  # Python 3: range is already lazy


def process_training_data(num_clips):
    """
    Processes random training clips from the full training data. Saves to TRAIN_DIR_CLIPS by
    default.

    @param num_clips: The number of clips to process. Default = 5000000 (set in main()).

    @warning: This can take a couple of hours to complete with large numbers of clips.
    """
    # Start numbering at the count of entries already in the clips dir, so a second run appends
    # new files after the existing ones (presumably named 0..N-1 by earlier runs - TODO confirm).
    num_prev_clips = len(glob(c.TRAIN_DIR_CLIPS + '*'))

    for clip_num in _lazy_range(num_prev_clips, num_clips + num_prev_clips):
        clip = process_clip()

        # One compressed archive per clip, named by its running index.
        np.savez_compressed(c.TRAIN_DIR_CLIPS + str(clip_num), clip)

        if (clip_num + 1) % 100 == 0:
            print('Processed %d clips' % (clip_num + 1))


def usage():
    """
    Prints the supported command-line options to stdout.
    """
    print('Options:')
    print('-n/--num_clips= <# clips to process for training>')
    print('-t/--train_dir= <directory of the full training data>')
    print('-c/--clips_dir= <directory to save the processed clips in>')
    print(" (I suggest making this a hidden dir so the filesystem doesn't freeze")
    print(" with so many files. DON'T `ls` THIS DIRECTORY!)")
    print('-o/--overwrite (Overwrites the previous data in the training dir)')


def main():
    """
    Parses the command line, applies the options to the shared constants module and then
    processes the requested number of training clips.

    Prints usage and exits with status 2 on an unrecognized option or a non-integer
    --num_clips value.
    """
    ##
    # Handle command line input
    ##

    num_clips = 5000000
    overwrite = False

    try:
        opts, _ = getopt.getopt(sys.argv[1:], 'n:t:c:o',
                                ['num_clips=', 'train_dir=', 'clips_dir=', 'overwrite'])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-n', '--num_clips'):
            try:
                num_clips = int(arg)
            except ValueError:
                # A malformed count is a usage error, not a reason for a traceback.
                usage()
                sys.exit(2)
        elif opt in ('-t', '--train_dir'):
            c.TRAIN_DIR = c.get_dir(arg)
        elif opt in ('-c', '--clips_dir'):
            c.TRAIN_DIR_CLIPS = c.get_dir(arg)
        elif opt in ('-o', '--overwrite'):
            overwrite = True

    # Clear only after every option has been applied: clearing while still parsing would wipe
    # the default clips dir when -o appears before -c on the command line.
    if overwrite:
        c.clear_dir(c.TRAIN_DIR_CLIPS)

    ##
    # Process data for training
    ##

    process_training_data(num_clips)


if __name__ == '__main__':
    main()