path: root/sync_iasdfus_deleted.py
blob: c0c4baccf41774282bd8d8d47e47d4c830e85210
import sys
import urllib
import re
from photoblaster.db.models import Iasdfus
from photoblaster.db.models import ImCmd


def super_unquote(s):
    # URLs in the s3 logs can be percent-encoded multiple times, so unquote
    # repeatedly until the string stops changing (capped at 20 rounds).
    for _ in xrange(20):
        unquoted = urllib.unquote(s)
        if unquoted == s:
            break
        s = unquoted
    return s
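
# Illustration (hypothetical input): a doubly-encoded space survives one
# unquote pass as "%20" and only becomes " " on the second pass, which is
# why a single urllib.unquote call is not enough here:
#   super_unquote("a%2520b")  ->  "a b"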

# Search the Iasdfus table for rows with deleted=1. .all() loads every
# matching object into memory at once; there are about 92,000 of them.
deleted_urls = Iasdfus.search(deleted=True).all()
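# If memory becomes an issue, iterating the query lazily would avoid
# materializing all ~92,000 rows at once (this assumes the search() result
# supports iteration, like a SQLAlchemy Query; not confirmed here):
#   for url in Iasdfus.search(deleted=True):
#       ...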
#print len(deleted_urls)
# Note on sessions: the docs recommend one session shared across all
# requests, but this code creates a new session per query, which may be the
# bottleneck. In the flask-sqlalchemy pattern the session is created once at
# server start. The equivalent here would be to instantiate pb.db.Database()
# at the top of the script and call database.close() at the end. close()
# could also live in the class's __del__ method, the hook Python calls when
# an object is garbage-collected, so it fires when the database goes out of
# scope at script exit.
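# A minimal sketch of that pattern (the Database class shape and its close()
# method are assumptions about photoblaster.db, not its confirmed API):
#
#     class Database(object):
#         def close(self):
#             ...  # release the underlying session/connection
#         def __del__(self):
#             self.close()  # runs when the object is garbage-collected
#
#     database = Database()  # one session for the entire script
#     try:
#         ...  # all queries reuse this session
#     finally:
#         database.close()  # explicit close; __del__ is only a fallback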


# address is a field in Iasdfus (this data comes from the s3 logs), e.g.:
# im/ff/wigglelogo_1347403794_frankhats_1347403811_frankhats.gif
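# Walking that example through the parsing below:
#   parts   = address.split("/")  ->  ["im", "ff", "wigglelogo_...gif"]
#   dirpart = parts[1]            ->  "ff"
#   newfile = super_unquote(parts[2]), with any trailing "+http..." suffix
#             stripped by the re.split below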
n = 0
for url in deleted_urls:
    try:
        # Pull the "dir" and "newfile" values out of the Iasdfus address;
        # they are used below to look up the matching row in ImCmd.
        parts = url.address.split("/")
        dirpart = parts[1]
        newfile = super_unquote(parts[2])
        # Some filenames have a source URL appended; keep only the part
        # before the first (optionally "+"-prefixed) "http".
        newfile = re.split(r'\+?http', newfile)[0]
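        # For illustration, with a hypothetical appended URL:
        #   re.split(r'\+?http', "photo.gif+http://example.com")
        #   -> ["photo.gif", "://example.com"]
        # so newfile is reduced to "photo.gif".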
#        print "parts from iasdfus (newfile : %s, dir: %s)" % (newfile, dirpart)
    except IndexError:
        continue
    try:
        # Look up the ImCmd row whose dir and newfile match the values
        # from the Iasdfus row.
        query = ImCmd.search(**{"dir": dirpart, "newfile": newfile})
        n += 1
        # This lookup sometimes stalls regardless of the item, which points
        # at a session or connection limit rather than the data. Query
        # objects have no .free()/cleanup method, and even a query that
        # matches nothing still returns a Query object, so it is probably a
        # session setting; one theory is that after a rollback error there
        # is roughly a 30-second timeout before the next query proceeds.
        # Execute the query and take the first matching row.
        matching_url = query.first()
#        print "got first\n"
        if not matching_url:
            print "imcmd: %s %s\n"  % (dirpart, newfile)
            continue
        if matching_url.deleted == 1:
#            print "is deleted\n"
            continue
        #update matching_url
        matching_url.update(deleted=1)
        print "update done\n"
    except AttributeError:
        # Unclear why this AttributeError occurs; it may need the same
        # rollback handling as other database errors. Re-raise for now so
        # the failure stays visible. The slowness is a separate issue:
        # ImCmd is huge (over 600,000 rows), so each lookup scans the
        # table; a unique index on (dir, newfile) would make it fast.
        raise
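
# A composite index like the following would speed the lookups above. The
# table and column names are assumed from the model fields used here, and
# the statement is only a sketch for whatever database backs ImCmd:
#
#     CREATE UNIQUE INDEX idx_imcmd_dir_newfile ON imcmd (dir, newfile);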