#!/usr/bin/env sampy
"""
Script to print out the descendants of the files in a particular dataset,
filtered to a particular dataTier and appVersion.

Usage:

  CLI:
    ./GetProjectDescendants.py --definitionName=<defName> --appVersion=<version> --dataTier=<dt>

  API:
    #!/usr/bin/env sampy
    # make sure your PYTHONPATH includes $SAM_DIR/examples
    from GetProjectDescendants import getProjectDescendants
    gpd = getProjectDescendants()
    theList = gpd(definitionName='defName', appVersion='version', dataTier='dt')

Where:
    definitionName = dataset definition name containing the input files (raw data)
    dataTier       = the datatier of the output files
    appVersion     = the appVersion producing the output files

NOTE: getProjectDescendants.implementation() prints its report and has no
return statement; whatever the API call above returns is determined by
SamUserApi_CommandInterface.
"""

import sys
import string
import os

import SAM
from Sam import sam
from SamUtility.DbDerivedClient import DbDerivedClient
from SamStruct.SamTime import SamTime
from SamStruct.DataFilePhysicalAttributes import DataFilePhysicalAttributes
from SamStruct.DataFilePhysicalAttributesList import DataFilePhysicalAttributesList
from SamException import SamExceptions
from SamUserApi_CommandInterface import SamUserApi_CommandInterface

# we will use the $SAM_DIR/examples/GetFileDescendants.py and GetRawAncestors.py scripts,
# so make sure that $SAM_DIR/examples is in our $PYTHONPATH:
exampleDir = os.path.expandvars("${SAM_DIR}/examples")
sys.path.append(exampleDir)
try:
    from GetFileDescendants import getFileDescendants
    from GetRawAncestors import getRawAncestors
except ImportError:
    # sys.exit() raises SystemExit on its own; either of the two imports above
    # may be the one that failed, so report both names plus the actual error.
    # sys.exc_info() is used instead of "except ... as" to stay independent of
    # the Python 2 minor version behind sampy.
    sys.exit("No GetFileDescendants.getFileDescendants and/or "
             "GetRawAncestors.getRawAncestors module found in this release of sam "
             "(%s), cannot proceed." % (sys.exc_info()[1],))


#####################################################################################################
def _sumEventCounts(files):
    """Return the sum of getEventCount() over every item in files.

    An empty sequence sums to 0.  If the counts cannot be added (TypeError,
    e.g. a count that is not a number) the total is reported as 0 instead of
    aborting the report.
    """
    try:
        return sum([f.getEventCount() for f in files])
    except TypeError:
        return 0


#####################################################################################################
class getProjectDescendants(SamUserApi_CommandInterface):
    """Command that reports which files of a dataset definition have
    descendants in a given dataTier / appVersion, and which do not."""

    def __init__(self):
        """Register the command's verb, required options and help text with
        SamUserApi_CommandInterface."""
        SamUserApi_CommandInterface.__init__(
            self,
            commandParameters={
                'verb': ['get project accounting'],
                'requiredOptions': ['%s=' % SAM.attrDefinitionName,
                                    '%s=' % SAM.attrAppVersion,
                                    '%s=' % SAM.attrDataTier],
                'description': {
                    SAM.attrDefinitionName: 'the dataset definition that was the basis for the list of raw files',
                    SAM.attrAppVersion: 'the appVersion of interest in the reconstructed files',
                    SAM.attrDataTier: 'the dataTier of interest to account for',
                },
                'synonymousOptions': {SAM.attrDefinitionName: ['defName']},
                'helpText': """
Script to assist Mike in accounting for reconstruction at various facilities.
""",
            })

    def implementation(self, argDict, argList):
        """Print an accounting report for one dataset definition.

        argDict supplies the definition name, appVersion and dataTier under
        the SAM.attrDefinitionName / SAM.attrAppVersion / SAM.attrDataTier
        keys.  argList is not used.

        The report lists: the file and event totals of the dataset, the file
        and event totals of its descendants in the requested dataTier and
        appVersion, and every dataset file that is not an ancestor of any of
        those descendants.
        """
        defName = argDict.get(SAM.attrDefinitionName)
        version = argDict.get(SAM.attrAppVersion)
        dt = argDict.get(SAM.attrDataTier)

        # get the original list of files in the definition
        datasetFiles = sam.translateConstraints(dimensions="__set__ %s" % defName)
        datasetEventCount = _sumEventCounts(datasetFiles)
        print("There were %s files (%s events) in the original dataset with defName='%s'."
              % (len(datasetFiles), datasetEventCount, defName))

        # now find all of the children in the specified dataTier:
        gd = getFileDescendants()
        allDescendants = DataFilePhysicalAttributesList()
        for f in datasetFiles:
            descendants = gd(args=f.getFileName(), dataTier=dt, appVersion=version)
            for d in descendants:
                if d not in allDescendants:
                    allDescendants.append(d)
        descendantEventCount = _sumEventCounts(allDescendants)

        # and go back UP the chain to find the raw parents of the children
        # in the specified data tier.  Only the ancestors' file names are
        # needed, so collect them in a dict used as a set (constant-time
        # membership without relying on the `set` builtin being available).
        ga = getRawAncestors()
        ancestorNames = {}
        for d in allDescendants:
            for a in ga(args=d.getFileName()):
                ancestorNames[a.getFileName()] = True

        print("There are %s files (%s events) in dataTier='%s' with appVersion='%s'."
              % (len(allDescendants), descendantEventCount, dt, version))

        # Decide from the actual set difference, not from comparing counts:
        # raw ancestors may include files that are not part of this dataset,
        # so equal counts would not prove that every input file is covered.
        missingFiles = [f for f in datasetFiles
                        if f.getFileName() not in ancestorNames]
        if not missingFiles:
            print("Looks like all input files are accounted for.")
        else:
            print("Input files not accounted for in dataTier='%s', appVersion='%s':"
                  % (dt, version))
            for f in missingFiles:
                print("\t%s (%s events)" % (f.getFileName(), f.getEventCount()))


#####################################################################################################
def main(args):
    """Run the command on the given command-line arguments and return the
    result of dispatch()."""
    return getProjectDescendants().dispatch(args)


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))