#!/usr/bin/python ############################################################################### # You can modify the configuration below. but DON'T EDIT the config=""" line, # # nor the """ line at the end of the configuration! insert/delete/modify # # ONLY between those two lines. Else, you would break the config parser. # # # # 'hostname' can be left blank : the program will then use gethostname(). # # You can modify those values interactively by specifying "config" on the # # command line. # ############################################################################### config=""" hostname= mysql_host=localhost mysql_db=blastidx mysql_user=blastidx mysql_passwd=blastidx """ ############################################################################### # You should not need to modify anything below this. # ############################################################################### createdbcodetemplate=""" CREATE DATABASE %(db)s ; GRANT ALL PRIVILEGES ON %(db)s.* TO '%(user)s'@'%(mysql_access_from)s' IDENTIFIED BY '%(passwd)s' ; USE %(db)s ; CREATE TABLE conf ( hostname varchar(32) NOT NULL default '', label varchar(32) NOT NULL default '', value tinytext NOT NULL ) ; INSERT INTO conf (label,value) VALUES ('path','/tmp') ; CREATE TABLE logs ( hostname varchar(32) NOT NULL default '', begin DATETIME default NULL, end DATETIME default NULL ) ; INSERT INTO logs (begin,end) values ("2003-11-01 10:00:00","2003-11-01 10:00:10") ; CREATE TABLE files ( id int(11) NOT NULL auto_increment, hostname varchar(32) NOT NULL default '', name tinytext NOT NULL, path text NOT NULL, size bigint(20) NOT NULL default '-1', md5 char(32) default NULL, indextime DATETIME default NULL, updatetime DATETIME default NULL, fsctime int(11) default '0', fsmtime int(11) default '0', PRIMARY KEY (id), FULLTEXT KEY path (path), KEY size (size), KEY indextime (indextime), KEY updatetime (updatetime), KEY pathsize (path(32),size), KEY md5 (md5) ) ; CREATE TABLE dirs ( id int(11) NOT NULL auto_increment, hostname varchar(32) NOT NULL default '', name tinytext NOT NULL, path text NOT NULL, indextime DATETIME default NULL, updatetime DATETIME default NULL, PRIMARY KEY (id), FULLTEXT KEY path (path) ) ; INSERT INTO files (name,path,size,md5,indextime,updatetime) VALUES ('emptyfile','/tmp/emptyfile',0,'d41d8cd98f00b204e9800998ecf8427e',"2003-11-01 10:00:05",updatetime) ; INSERT INTO dirs (name,path,indextime,updatetime) VALUES ('tmp','/tmp',"2003-11-01 10:00:05",updatetime) ; """ import os,sys,socket,md5,stat ERROR,WARNING,INFO="ERROR","WARNING","INFO" hushmsg=[] def msg(level,message): if level not in hushmsg: sys.stderr.write("%s: %s\n"%(level,message)) if level==ERROR: sys.exit(1) try: import MySQLdb except: msg(ERROR,"No MySQL support in your Python installation. Bye.") # read built-in "configuration" mysqlconfig={} mysqlparams=["host","user","db","passwd"] for kv in filter(None,map(lambda s:s.strip(),config.split("\n"))): k,v=kv.split("=") k,v=k.strip(),v.strip() if k=="hostname": hostname=v continue if k in map(lambda s:"mysql_"+s,mysqlparams): mysqlconfig[k.split("_")[1]]=v continue msg(WARNING,"Unknown configuration parameter : %s"%kv) for k in mysqlparams: if not mysqlconfig.get(k,""): msg(WARNING,"MySQL parameter '%s' is not set. Database might fail."%k) # fallback to gethostname() if hostname is not set if not hostname: try: hostname=socket.gethostname() except: msg(WARNING,"could not determine hostname. Using localhost.") hostname="localhost" def initdb(): global cursor,currentcontentscondition try: cursor=MySQLdb.connect(**mysqlconfig).cursor() except: msg(ERROR,"Could not connect to database. Did you create it?") lastrun=query("select begin,end from logs where hostname=%s "+ "and end IS NOT NULL order by end desc limit 1",hostname) if lastrun: lastrun=lastrun[0] msg(INFO,"The date of the last DB update for %s is : %s"%( hostname,lastrun["end"])) currentcontentscondition=" (updatetime>='%s') "%lastrun["begin"] else: msg(WARNING,"The DB for %s was never sucessfully updated."%hostname) currentcontentscondition=" 1 " def query(q,t=()): try: cursor.execute(q,t) except Exception,e: msg(ERROR,"MySQL error while executing :\n%s\nError was:\n%s"%(q,e)) if not cursor.description: return keys=map(lambda x:x[0],cursor.description) return map(lambda r: dict(map(None,keys,r)),cursor.fetchall()) def makecreatedbcode(): print "The following settings will be used when creating the DB :" for k in mysqlparams: print "%s=%s"%(k,mysqlconfig[k]) print "If you don't like those settings, just interrupt now (Ctrl+C)." print "You can then change them by running 'blastidx config', or by" print "editing the blastidx script. If you are OK, please answer :" prompt="Should the MySQL DB be accessable from remote hosts [y/N] ? " answer=raw_input(prompt) if answer and answer.upper()[0]!="Y": answer="localhost" else: answer="%" templateparams=mysqlconfig.copy() templateparams["mysql_access_from"]=answer for param in mysqlparams: templateparams[param]=templateparams[param].replace(r"'",r"\'") createdbcode=createdbcodetemplate%templateparams return createdbcode def md5sum(f): try: f = file(f) md = md5.new() tmp = f.read(8192) while tmp: md.update(tmp) tmp = f.read(8192) f.close() return md.hexdigest() except: msg(WARNING,"Could not compute md5sum for %s (is it a file?)"%f) return 32*"-" def action_update(): initdb() now=query("select now()")[0].values()[0] query("insert into logs (hostname,begin) values (%s,%s)",(hostname,now)) paths=map(lambda h:h['value'], query("select value from conf "+ "where hostname=%s and label='path'",hostname)) for p in paths: for root,dirs,files in os.walk(p): for dirname in dirs: path=os.path.join(root,dirname) ids=query("select id from dirs where hostname=%s and path=%s order by indextime desc",(hostname,path)) if ids: msg(INFO,"updating %s"%path) query("update dirs set updatetime=now() where id=%s"% ids[0]["id"]) else: msg(INFO,"inserting %s"%path) query("insert into dirs (hostname,name,path,indextime,updatetime) values (%s,%s,%s,now(),now())",(hostname,dirname,path)) for filename in files: path=os.path.join(root,filename) if not os.access(path,os.R_OK): msg(WARNING,"can't access %s"%path) continue st = os.stat(path) size,mtime,ctime=st[6],st[8],st[9] ids=query("select id from files where hostname=%s and path=%s and size=%s order by indextime desc",(hostname,path,size)) if ids: msg(INFO,"updating %s"%path) query("update files set updatetime=now() where id=%s"% ids[0]["id"]) else: msg(INFO,"inserting %s"%path) query("insert into files (hostname,name,path,size,md5,indextime,updatetime,fsctime,fsmtime) values (%s,%s,%s,%s,%s,now(),now(),%s,%s)",(hostname,filename,path,size,md5sum(path),ctime,mtime)) query("update logs set end=now() where hostname=%s and begin=%s", (hostname,now)) def action_qupdate(): hushmsg.append(INFO) hushmsg.append(WARNING) action_update() def action_query(): if len(sys.argv)<3: msg(ERROR,"you must specify a query string on the command line.") _query(sys.argv[2]) def action_queryh(): if len(sys.argv)<3: msg(ERROR,"you must specify a query string on the command line.") _query(sys.argv[2],1) def _query(q,allhosts=None,extrasqlcondition="1"): initdb() qs=q.replace('*','%') qlimit=10 if allhosts: results=query("select distinct md5,hostname,path from files where "+ currentcontentscondition+" and "+extrasqlcondition+" "+ "and name like concat('%%',%s,'%%') limit %s", (qs,qlimit+1)) else: results=query("select distinct md5,path from files where "+ currentcontentscondition+" and "+extrasqlcondition+" "+ "and hostname=%s "+ "and name like concat('%%',%s,'%%') limit %s", (hostname,qs,qlimit+1)) if len(results)>qlimit: msg(INFO,"More results found ; showing only %d results."%qlimit) else: msg(INFO,"%d results found."%len(results)) for r in results[:qlimit]: if allhosts: print "%(md5)s | %(hostname)-20.20s | %(path)s"%r else: print "%(md5)s %(path)s"%r def action_help(): print "Please specify a valid action on command line :" actionslist=actions.keys() actionslist.sort() for action in actionslist: print "%-10s %s"%(action,actions[action][1]) print "If you don't know BlastIDX, you really should read the tutorials!" sys.exit(1) def action_showsql(): print makecreatedbcode() def action_createdb(): createdbcode=makecreatedbcode() msg(INFO,"SQL code was generated.") print "To create the database, I will need mysql administrator rights." print "If you don't want to trust me, you can run the code yourself" print "(using showsql instead of createdb, and piping that thru mysql)." sqlroot={} sqlroot["user"]=raw_input("MySQL admin name (probably 'root') : ") sqlroot["passwd"]=raw_input("MySQL admin password (will be shown!) : ") sqlroot["host"]=raw_input("MySQL host (just hit enter if localhost) : ") msg(INFO,"Connecting to MySQL...") try: curs=MySQLdb.connect(**sqlroot).cursor() except Exception,e: msg(ERROR,"Sorry, I could not connect to MySQL.\nERROR: "+e[1]) queries=filter(None,map(lambda q:q.strip(),createdbcode.split(";"))) # first query is CREATE DATABASE msg(INFO,"Creating blastidx database...") try: curs.execute(queries[0]) except Exception,e: msg(ERROR,"I could not create the database.\nERROR: "+e[1]) msg(INFO,"Creating blastidx tables...") for q in queries[1:]: try: curs.execute(q) except Exception,e: msg(ERROR,"There was a SQL error. Sorry.\nERROR: "+e[1]+ "\nERROR: leaving broken blastidx database.") msg(INFO,"Done!") def action_showpaths(): initdb() ps=query("select * from conf where label='path' and hostname=%s",hostname) msg(INFO,"%d path(s) configured."%len(ps)) for p in ps: print p["value"] def action_addpath(): initdb() if len(sys.argv)<3: msg(ERROR,"you must specify a path on the command line.") if query("select 1 from conf where hostname=%s and label=%s and value=%s", (hostname,"path",sys.argv[2])): msg(ERROR,"%s already exists in the configuration DB."%sys.argv[2]) if not os.access(sys.argv[2],os.F_OK): msg(WARNING,"%s does not exist. Adding anyway."%sys.argv[2]) else: if not stat.S_ISDIR(os.stat(sys.argv[2])[0]): msg(WARNING,"%s is not a directory. Adding anyway."%sys.argv[2]) query("insert into conf (hostname,label,value) values (%s,%s,%s)", (hostname,"path",sys.argv[2])) msg(INFO,"added path %s."%sys.argv[2]) def action_delpath(): initdb() if len(sys.argv)<3: msg(ERROR,"you must specify a path on the command line.") if not query("select 1 from conf where hostname=%s and label=%s and value=%s", (hostname,"path",sys.argv[2])): msg(ERROR,"%s does not exist in the configuration DB."%sys.argv[2]) query("delete from conf where hostname=%s and label=%s and value=%s", (hostname,"path",sys.argv[2])) msg(INFO,"dropped path %s."%sys.argv[2]) def action_config(): myname=sys.argv[0] myfd=open(myname) mydata=myfd.read() myfd.close() if mydata.find(config)<0: msg(ERROR,"Something really weird happened -\n"+ "I could not find config info in myself.") newconfig=[] msg(INFO,"You are about to change the configuration.") msg(INFO,"Your modifications will be saved at the end of the process.") msg(INFO,"You can abort with Ctrl+C at any time to cancel.") print "What hostname should I use in the index ?" print "(you can safely just hit ENTER ; I will then use gethostname())" newconfig.append("hostname=%s"%raw_input("hostname=")) def paraminput(paramname): print "(current value of mysql_%s is %s ; just press ENTER to keep it"%\ (paramname,mysqlconfig[paramname]) answer=raw_input("mysql_%s="%paramname) if not answer: answer=mysqlconfig[paramname] newconfig.append("mysql_%s=%s"%(paramname,answer)) paraminput("host") paraminput("db") paraminput("user") paraminput("passwd") newconfig="\n".join([""]+newconfig+[""]) msg(INFO,"Hit ENTER to save config, or anything else to cancel!") if raw_input("Yalla!"): msg(ERROR,"Configuration aborted.") try: newfd=open(myname+"-new","w") newfd.write(mydata.replace(config,newconfig)) newfd.close() os.rename(myname,myname+"-old") os.rename(myname+"-new",myname) os.chmod(myname,0755) except Exception,e: msg(ERROR,"I could not save configuration (%s)."%e) msg(INFO,"Configuration saved.") msg(INFO,"Old configuration was stored into %s."%(myname+"-old")) def action_finddupes(): initdb() fingerprints=query("select md5,size from files where hostname=%s "+ "and "+currentcontentscondition+ "group by md5,size having count(*)>1 "+ "order by size asc",hostname) msg(INFO,"Found %s fingerprint(s) with duplicates."%len(fingerprints)) for f in fingerprints: files=query("select path from files where hostname=%s "+ "and md5=%s and size=%s ",(hostname,f["md5"],f["size"])) for name in files: print "%s %12s %s"%(f["md5"],f["size"],name["path"]) def action_stats(): initdb() def count(x,y=()): return query("select count(*) from "+x,y)[0]["count(*)"] totalupdates=count("logs") failedupdates=count("logs where end IS NULL") okupdates=count("logs where end IS NOT NULL") print "The DB was updated %s times (%s failures, %s successes)."%( totalupdates,failedupdates,okupdates) totalupdates=count("logs where hostname=%s",hostname) failedupdates=count("logs where hostname=%s and end IS NULL",hostname) okupdates=count("logs where hostname=%s and end IS NOT NULL",hostname) print "For this host (%s), I see %s updates (%s failures, %s successes)."%( hostname,totalupdates,failedupdates,okupdates) totalfiles=count("files") myfiles=count("files where hostname=%s",hostname) currentfiles=count("files where "+currentcontentscondition) mycurrentfiles=count("files where hostname=%s and "+ currentcontentscondition,hostname) print "The DB has %s files. %s records are up-to-date."%( totalfiles,currentfiles) print "For this host (%s), there are %s records. %s are up-to-date."%( hostname,myfiles,mycurrentfiles) print "Date of the most recently indexed file : "+str(query( "select max(indextime) from files")[0]["max(indextime)"]) print "For this host (%s) : %s"%( hostname, str(query("select max(indextime) from files where hostname=%s", hostname)[0]["max(indextime)"])) def action_last(): initdb() if len(sys.argv)<3: count=10 else: try: count=int(sys.argv[2]) except: msg(ERROR,"Invalid number : "+sys.argv[2]) msg(INFO,"Displaying up to %d last indexed entries."%count) for h in query("select md5,name,indextime from files where hostname=%s "+ "and "+currentcontentscondition+" order by indextime desc "+ "limit %s",(hostname,count)): print "%(md5)s | %(indextime)s | %(name)s"%h def action_missing(): initdb() r=query("select f1.hostname,f1.md5,f1.name from "+ "files as f1 left join files as f2 on f1.md5=f2.md5 "+ "and f1.hostname!=%s and f2.hostname=%s "+ "where f1.hostname!=%s and f2.hostname is null",3*(hostname,)) msg(INFO,"Found %d records that you might want to mirror."%len(r)) for h in r: print "%(md5)s | %(hostname)-20.20s | %(name)s"%h importexportattrs=["hostname","name","path","size","md5", "indextime","updatetime","fsctime","fsmtime"] importexportattrsstring=",".join(importexportattrs) importexportmappings=",".join(map(lambda x:"%%(%s)s"%x,importexportattrs)) def action_export(): initdb() r=query("select "+importexportattrsstring+" from files "+ "where hostname=%s and "+currentcontentscondition,hostname) msg(INFO,"Exporting %d records to stdout."%len(r)) print repr(importexportattrs) for h in r: print repr(map(lambda k,h=h: str(h[k]),importexportattrs)) def action_import(): initdb() msg(INFO,"Importing data from stdin. You can abort with Ctrl+C.") attrs=input() for a in importexportattrs: if a not in attrs: msg(ERROR,"Missing attribute : %s ; aborting."%a) msg(INFO,"Header was valid. Good, keep going.") count=0 while 1: try: values=input() h=dict(zip(attrs,values)) if h["hostname"]==hostname: msg(WARNING,"Refusing to import a record coming from myself.") else: msg(INFO,"Importing from %(hostname)s : %(path)s"%h) query("insert into files ("+importexportattrsstring+") "+ "values ("+importexportmappings+")",h) count+=1 except EOFError: break msg(INFO,"Successfully imported %d records."%count) def action_clear(): if len(sys.argv)<3: msg(ERROR,"Please specify the hostname whose files should be deleted.") wipeout=sys.argv[2] if wipeout==hostname: msg(WARNING,"You are trying to delete your own index.") msg(WARNING,"I won't do that: you probably want to 'cleanup' instead.") msg(ERROR,"Cowardly aborting.") initdb() query("delete from files where hostname=%s",wipeout) msg(INFO,"Dropped all records for hostname %s."%repr(wipeout)) def action_cleanup(): initdb() query("delete from files where hostname=%s "+ "and not "+currentcontentscondition,hostname) msg(INFO,"Dropped all obsolete records related to this host.") def action_md5check(): initdb() files=query("select path,md5 from files where hostname=%s and "+ currentcontentscondition+ "order by path",hostname) msg(INFO,"%d files to check."%len(files)) badlist=[] for f in files: msg(INFO,"Checking: %s"%f["path"]) if f["md5"]!=md5sum(f["path"]): msg(WARNING,"Bad md5: %s"%f["path"]) badlist.append(f["path"]) msg(INFO,"%d/%d md5sums were invalid."%(len(badlist),len(files))) for path in badlist: msg(WARNING,path) msg(INFO,"md5check done.") cgienvvars=["QUERY_STRING","SCRIPT_NAME","REQUEST_URI"] def action_genphpstub(): sys.stdout.write(' passthru(') for var in cgienvvars: sys.stdout.write('"PHP_%s=".escapeshellarg($%s)." ".'%(var,var)) sys.stdout.write('"%s"); ?>'%os.path.abspath(sys.argv[0])) sys.stdout.write("\n") def action_cgi(): print "Content-type: text/html" print "" _cgi() def action_php(): for var in cgienvvars: os.environ[var]=os.environ["PHP_"+var] _cgi() import urllib def decode_qs(qs): h={} for kv in qs.split('&'): if kv.find("=")<0: h["glob"]=urllib.unquote_plus(kv) else: k,v=map(urllib.unquote,kv.split("=",1)) h[k]=v return (h.get("glob",None),h.get("sql","1")) def decode_uri(uri): if uri[0]!='/': msg(ERROR,"Partial URI doesn't start with /.") components=map(urllib.unquote_plus,uri[1:].split("/")) glob=components[0] sql=" and ".join(components[1:]) if not sql: sql="1" return (glob,sql) def _cgi(): sys.stderr=sys.stdout qs=os.environ["QUERY_STRING"] uri=os.environ["REQUEST_URI"] sname=os.environ["SCRIPT_NAME"] """print "
%s\n%s\n%s\n"%(qs,uri,sname)""" glob=None if qs: # request thru form glob,extrasql=decode_qs(qs) else: if uri!=sname: # request embedded in path glob,extrasql=decode_uri(uri[len(sname):]) if glob: # there was a request, execute it print "
" _query(glob,extrasqlcondition=extrasql) print "" else: # display default form print """ BlastIDX query for host %(hostname)s: """%{"hostname":hostname,"sname":sname} sys.exit(0) def action_tutorial1(): print """ Quick introduction to BlastIDX: (you can print this for further reference) 1. You need a MySQL server. Go install one if necessary. 2. Run "%(blastidx)s config" if the database runs on another host, or if you want to change the default user/password/database name used by BlastIDX. (This should not be needed, really.) 3. If you know the "root" password for your MySQL databse, you can run "%(blastidx)s createdb" (it will prompt the MySQL administrator user and password). If you don't have administrator rights, or if you don't trust this program, you can run "%(blastidx)s showsql" : it will dump the SQL code that you need to execute to create the database structure. 4. Now, you can run "%(blastidx)s addpath /mnt/space/new_mp3s" (for instance), and then, "%(blastidx)s update". It will index the directory! Great! But don't add a big directories yet : the md5 hash computation takes time. 5. Now that you have some real index, try "%(blastidx)s query foo*bar". It should show the md5 hash and the full path of all files named *foo*bar*. 6. You can also run "%(blastidx)s finddupes" to check for duplicate files. 7. Add some more paths (you can use showpaths, addpath, delpath). 8. Arrange a crontab job to be run periodically ; but in your cron job, don't execute "%(blastidx)s update" : use "%(blastidx)s qupdate" instead. It is a quiet update, and will only output anything if there are fatal errors (it won't spit out the list of files). Congratulation ! You completed the basic tutorial."""%{ "blastidx":os.path.basename(sys.argv[0])} def action_tutorial2(): print """ BlastIDX web front-end configuration: 1. You need a working BlastIDX setup (see tutorial1 if needed). 2. You need to be able to use CGI or PHP. 3a. If you use CGI, just copy %(blastidx)s to your CGI directory, for instance /usr/lib/cgi-bin. 3b. If you use PHP, generate the PHP stub by running : %(blastidx)s genphpstub > %(blastidx)s.php You can rename the PHP file as you want. Move it in some web directory. This PHP file is very simple, and you can embed it easily in a web page if you want. You don't need to modify it if you reconfigure BlastIDX - only if you move %(blastidx)s. 4. Now, go to the URL of your CGI or PHP script. It should work."""%{ "blastidx":os.path.basename(sys.argv[0])} def action_tutorial3(): print """ BlastIDX allows you to export your index as a text file, and import indexes of other people. However, to simplify procedures, there is no support (yet) for partial index export (for instance, to import only the records that were created or updated during the last run). 1. Go to host X, and run "%(blastidx)s export > X.index". 2. Move the file "X.index" to host Y. NOTE: it is a text file, and you can compress it very efficiently. 3. On host Y, run "%(blastidx)s clear X" (X should be the hostname of host X ; you can see it running "head -2 X.index | cut -f2 -d\'", on the second line) 4. Now, on host Y, run "%(blastidx)s import < X.index". 5. Try to query your combined index, but use "%(blastidx)s queryh foo*bar" instead of just "query" to search the whole index. 6. You can also use "%(blastidx)s missing" to show the files that you don't have, but which exist on other hosts (the md5 hash will be compared, so don't worry about different file names)."""%{ "blastidx":os.path.basename(sys.argv[0])} actions= { "update":(action_update,"Scan filesystem to update database index"), "qupdate":(action_qupdate,"Quietly scans filesystem to update DB"), "query":(action_query,"Query index. Example : query ghost*shell"), "queryh":(action_queryh,"Query index of all hosts in DB."), "createdb":(action_createdb,"Create index database"), "showsql":(action_showsql,"Show SQL code to create database"), "showpaths":(action_showpaths,"Show directories to be indexed"), "addpath":(action_addpath,"Add a path to index. Ex: addpath /home"), "delpath":(action_delpath,"Remove path. Ex: delpath /tmp"), "help":(action_help,"Shows this help"), "stats":(action_stats,"Show some statistics"), "missing":(action_missing,"List missing files of other hosts"), "config":(action_config,"Change configuration interactively"), "finddupes":(action_finddupes,"Find duplicate files"), "cgi":(action_cgi,"Behave as a CGI script (for debugging)"), "php":(action_php,"Do PHP processing (CGI with PHP_QUERY_STRING)"), "genphpstub":(action_genphpstub,"Generate PHP script"), "last":(action_last,"Display most recent contents"), "export":(action_export,"Dump a list of this host's current index"), "import":(action_import,"Import (from stdin) an exported index"), "clear":(action_clear,"Delete records from the specified host"), "cleanup":(action_cleanup,"Delete obsolete records"), "md5check":(action_md5check,"Check file integrity"), "tutorial1":(action_tutorial1,"Basic tutorial (READ THIS FIRST!)"), "tutorial2":(action_tutorial2,"Extended tutorial (web support)"), "tutorial3":(action_tutorial3,"Extended tutorial (import/export)") } if os.environ.has_key("QUERY_STRING"): action_cgi() if os.environ.has_key("PHP_QUERY_STRING"): action_php() if len(sys.argv)<2: action_help() actions.get(sys.argv[1],(action_help,""))[0]()