path: root/scripts/s3upload.py
blob: f4a5a771b09ac37142cb26678cc832a3558c4949
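"""Upload date-stamped dump directories to S3.

Walks the given directory for subdirectories named YYYYMMDD and uploads
every regular file in those whose date falls within [startdate, enddate]
to the target bucket with a public-read ACL. Runs as a dry run by
default; pass 'false' (or anything other than 'true') as the fourth
argument to actually upload.
"""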
import datetime
import mimetypes
import os
import sys
import time
import S3  # Amazon's legacy S3.py sample library (provides AWSAuthConnection)

CONN = None
# AWS credentials come from the environment; never hard-code live keys
# in a committed script.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', '')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
BUCKET_NAME = 'dumpfm'

def parse_date(date_string, fmt='%Y%m%d'):
    return datetime.datetime.strptime(date_string, fmt)

def retry_func(f, count):
    """Call f(), retrying on any exception up to count attempts in total."""
    try:
        f()
    except Exception:
        if count <= 1:
            raise
        print 'Error! retrying %s more time(s)' % (count - 1)
        retry_func(f, count - 1)
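
# Usage sketch: wrap the call in a zero-argument callable, e.g.
#   retry_func(lambda: CONN.put(BUCKET_NAME, key, obj, headers), 3)
# (key, obj and headers are placeholders; upload_file below uses a
# nested function for the same purpose.)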

def upload_file(path, dry_run=True):
    """Upload one file to S3; returns True if path was a regular file."""
    path = os.path.normpath(path)
    if path == '.' or not os.path.isfile(path):
        return False
    with open(path, 'rb') as f:  # reading even on a dry run verifies the file is readable
        filedata = f.read()
    content_type = mimetypes.guess_type(path)[0]
    if not content_type:
        content_type = 'text/plain'

    path = path.replace('\\', '/')  # S3 keys use forward slashes, even on Windows
    if not dry_run:
        start = time.time()
        def do_put():
            # Closure over filedata and headers so retry_func can re-run the call.
            CONN.put(BUCKET_NAME, path, S3.S3Object(filedata),
                     {'x-amz-acl': 'public-read', 'Content-Type': content_type})
        retry_func(do_put, 3)
        ms_took = (time.time() - start) * 1000
        print "uploaded %s (%0.0fms)" % (path, ms_took)
    return True
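
# For reference only: the same put with the modern boto3 SDK would look
# roughly like the sketch below (boto3 is not used by this script).
#
#   import boto3
#   s3 = boto3.client('s3')  # credentials resolved from the environment
#   s3.put_object(Bucket=BUCKET_NAME, Key=path, Body=filedata,
#                 ACL='public-read', ContentType=content_type)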


def do_upload(directory, start_date, end_date, dry_run=True):
    global CONN
    CONN = S3.AWSAuthConnection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

    for subdir in sorted(os.listdir(directory)):
        # Subdirectory names are expected to be dates (YYYYMMDD); skip the rest.
        try:
            subdir_date = parse_date(subdir)
        except ValueError:
            continue

        if start_date <= subdir_date <= end_date:
            counter = 0
            print "uploading contents of %s" % subdir
            for filename in os.listdir(os.path.join(directory, subdir)):
                path = os.path.join(directory, subdir, filename)
                if upload_file(path, dry_run=dry_run):
                    counter += 1  # count only paths that were regular files

            print 'handled %s files' % counter

if __name__ == "__main__":
    if not 4 <= len(sys.argv) <= 5:
        print 'usage: s3upload.py directory startdate enddate [dryrun=true]'
        sys.exit(1)

    directory = sys.argv[1]
    start_date = sys.argv[2]
    end_date = sys.argv[3]
    dry_run = sys.argv[4] if len(sys.argv) == 5 else 'true'

    # Anything other than 'true' (case-insensitive) disables the dry run.
    if dry_run.lower() == 'true':
        print 'doing dry run'
        dry_run = True
    else:
        dry_run = False

    try:
        start_date = parse_date(start_date)
    except ValueError:
        print "invalid start date: %s" % start_date
        sys.exit(1)

    try:
        end_date = parse_date(end_date)
    except ValueError:
        print "invalid end date: %s" % end_date
        sys.exit(1)

    do_upload(directory, start_date, end_date, dry_run)
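
# Example invocations (directory and dates are hypothetical):
#   python s3upload.py /data/dumps 20100101 20100131         # dry run (default)
#   python s3upload.py /data/dumps 20100101 20100131 false   # real upload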