#!/usr/bin/env sampy
"""
An example script showing how to find all of the 'raw' ancestors
(ancestors with dataTier='raw') of a particular file.

This script does NOT use the DATA_FILES_RAW table, which is not yet
populated accurately.  The DATA_FILES_RAW table, when appropriately
populated, will render this script obsolete except as an historical
example.

Usage:

    CLI:
        ./GetRawAncestors.py fileName

    API:
        #!/usr/bin/env sampy
        # make sure your PYTHONPATH includes $SAM_DIR/examples
        from GetRawAncestors import getRawAncestors
        g = getRawAncestors()
        physicalFileAttributeList = g(args='fileName')
"""

import os
import sys
import string

import SAM
from Sam import sam
from SamUtility.DbDerivedClient import DbDerivedClient
from SamStruct.DataFilePhysicalAttributes import DataFilePhysicalAttributes
from SamStruct.DataFilePhysicalAttributesList import DataFilePhysicalAttributesList
from SamException import SamExceptions
from SamUserApi_CommandInterface import SamUserApi_CommandInterface


class getRawAncestors(SamUserApi_CommandInterface):
    """
    'get raw ancestors' command: given a file name, report every ancestor
    of that file (any number of generations back) whose data tier is 'raw'.
    """

    def __init__(self):
        """Register the command's verb, argument count, return type and help."""
        SamUserApi_CommandInterface.__init__(
            self,
            commandParameters={
                'verb': ['get raw ancestors'],
                'allowedArgCount': 1,
                'apiReturns': DataFilePhysicalAttributesList,
                'description': {
                    'args': 'filename for which you wish to see the raw ancestors',
                },
                'helpText': """
Get a list of the raw ancestors of a given file (that is,
the original ancestors in dataTier 'raw').
""",
            })

    def cliProcessResults(self, listOfFiles):
        """
        Turn the result of implementation() into the text shown on the CLI.

        listOfFiles -- sequence of objects providing getFileName().

        Produces one file name per line, or a fixed message when the
        sequence is empty, and hands that string to the base class's
        cliProcessResults(), whose return value is returned unchanged.
        """
        if listOfFiles:
            stringifiedListOfFiles = '\n'.join(
                [entry.getFileName() for entry in listOfFiles])
        else:
            stringifiedListOfFiles = "No raw ancestors found."
        return SamUserApi_CommandInterface.cliProcessResults(
            self, stringifiedListOfFiles)

    def implementation(self, argDict, argList):
        """
        Look up the raw ancestors of the file named by argList[0].

        argDict -- not used by this command.
        argList -- positional arguments; argList[0] is the file name.

        Returns an empty DataFilePhysicalAttributesList when there are no
        raw ancestors, otherwise a list of DataFilePhysicalAttributes.

        Raises SamExceptions.DataFileNotFound when the file name lookup
        raises SamExceptions.DbMinRowsException.
        """
        self.db = DbDerivedClient('DbDataFiles')
        childFile = argList[0]

        # Resolve the file name to its fileId.
        try:
            childFileId = self.db.getOne(['fileId'], {'fileName': childFile})
        except SamExceptions.DbMinRowsException:
            raise SamExceptions.DataFileNotFound(
                "No such file: '%s'" % childFile)

        # Every ancestor of every generation, each id listed once.
        allGenerations = self._collectAncestorIds(childFileId)

        returnList = DataFilePhysicalAttributesList()
        if allGenerations:
            allGenerationsFileIdList = ', '.join(
                ["%s" % fileId for fileId in allGenerations])

            # Convert the ids to physical attributes, keeping only the
            # files whose data tier is 'raw'.
            theQuery = """
                select distinct df.file_id,
                       df.file_name,
                       df.file_size_in_bytes,
                       df.crc_value,
                       crc.crc_type_desc,
                       df.event_count
                  from data_files df,
                       crc_types crc,
                       data_tiers dt
                 where df.file_id in (%s)
                   and df.crc_type_id = crc.crc_type_id
                   and df.data_tier_id = dt.data_tier_id
                   and dt.data_tier = 'raw'
                """ % allGenerationsFileIdList
            rows = self.db.query(theQuery)
            if rows and rows[0]:
                # NOTE(review): this branch returns a plain list, while the
                # empty case returns a DataFilePhysicalAttributesList (the
                # declared 'apiReturns' type) -- confirm callers accept both.
                returnList = [
                    DataFilePhysicalAttributes(fileId=row[0],
                                               fileName=row[1],
                                               fileSize="%sB" % row[2],
                                               crcValue=row[3],
                                               crcType=row[4],
                                               eventCount=row[5])
                    for row in rows
                ]
        return returnList

    def _collectAncestorIds(self, childFileId):
        """
        Return the ids of all ancestors of childFileId, generation by
        generation, with each id appearing once.

        Ids already encountered are neither recorded nor expanded again, so
        a file reachable through several lineage paths is queried once and
        a cyclic lineage entry cannot make the walk run forever.
        """
        # A dict is used as the visited set so this also runs on
        # interpreters that predate the built-in set type.
        # Assumes the ids returned by the database are hashable scalars.
        seen = {}
        ancestorIds = []
        currentGeneration = [childFileId]
        while currentGeneration:
            nextGeneration = []
            for fileId in currentGeneration:
                for parentId in self._getParentsOf(fileId):
                    if parentId in seen:
                        continue
                    seen[parentId] = 1
                    ancestorIds.append(parentId)
                    nextGeneration.append(parentId)
            currentGeneration = nextGeneration
        return ancestorIds

    def _getParentsOf(self, fileId):
        """
        Get all direct parents of the specified fileId.

        Returns a list of file_id_source values from file_lineages; the
        list is empty when the query yields no rows.
        """
        returnList = []
        theQuery = """
            select fl.file_id_source
              from file_lineages fl
             where fl.file_id_dest = %s
            """ % fileId
        rows = self.db.query(theQuery)
        if rows and rows[0]:
            returnList = [row[0] for row in rows]
        return returnList


def main(args):
    """Run the command on the given argument list via dispatch()."""
    return getRawAncestors().dispatch(args)


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))