diff --git a/JavaExtractor/extract.py b/JavaExtractor/extract.py index 55ca9ef..acb3ad8 100644 --- a/JavaExtractor/extract.py +++ b/JavaExtractor/extract.py @@ -3,78 +3,63 @@ import itertools import multiprocessing import os -import sys import shutil import subprocess from threading import Timer -import sys from argparse import ArgumentParser -from subprocess import Popen, PIPE, STDOUT, call - - def get_immediate_subdirectories(a_dir): return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir) if os.path.isdir(os.path.join(a_dir, name))] +def ParallelExtractDir(args, tmpdir, dir_): + ExtractFeaturesForDir(args, tmpdir, dir_, "") -TMP_DIR = "" - -def ParallelExtractDir(args, dir): - ExtractFeaturesForDir(args, dir, "") - - -def ExtractFeaturesForDir(args, dir, prefix): +def ExtractFeaturesForDir(args, tmpdir, dir_, prefix): command = ['java', '-cp', args.jar, 'JavaExtractor.App', '--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width), - '--dir', dir, '--num_threads', str(args.num_threads)] - - # print command - # os.system(command) + '--dir', dir_, '--num_threads', str(args.num_threads)] kill = lambda process: process.kill() - outputFileName = TMP_DIR + prefix + dir.split('/')[-1] + outputFileName = tmpdir + prefix + dir_.split('/')[-1] failed = False with open(outputFileName, 'a') as outputFile: - sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE) + sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE,) timer = Timer(600000, kill, [sleeper]) - try: timer.start() stdout, stderr = sleeper.communicate() finally: timer.cancel() - if sleeper.poll() == 0: - if len(stderr) > 0: - print(sys.stderr, stderr, file=sys.stdout) - else: - print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time', file=sys.stdout) + if sleeper.poll() != 0: failed = True - subdirs = get_immediate_subdirectories(dir) + subdirs = get_immediate_subdirectories(dir_) for subdir in subdirs: - ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_') - if failed: - if os.path.exists(outputFileName): - os.remove(outputFileName) - + ExtractFeaturesForDir(args, subdir, prefix + dir_.split('/')[-1] + '_') + + if failed and os.path.exists(outputFileName): + os.remove(outputFileName) def ExtractFeaturesForDirsList(args, dirs): - global TMP_DIR - TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid()) - if os.path.exists(TMP_DIR): - shutil.rmtree(TMP_DIR, ignore_errors=True) - os.makedirs(TMP_DIR) - try: - p = multiprocessing.Pool(4) - p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs)) - #for dir in dirs: - # ExtractFeaturesForDir(args, dir, '') - output_files = os.listdir(TMP_DIR) - for f in output_files: - os.system("cat %s/%s" % (TMP_DIR, f)) - finally: - shutil.rmtree(TMP_DIR, ignore_errors=True) + tmp_dir = f"./tmp/feature_extractor{os.getpid()}/" + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir, ignore_errors=True) + os.makedirs(tmp_dir) + + for i in range(0, len(dirs), args.batch_size): + batch_dirs = dirs[i:i + args.batch_size] + timeout_seconds = 60 # timeout setting + try: + with multiprocessing.Pool(4) as p: + result = p.starmap_async(ParallelExtractDir, zip(itertools.repeat(args), itertools.repeat(tmp_dir), batch_dirs)) + result.get(timeout=timeout_seconds) + except multiprocessing.TimeoutError: + continue + output_files = os.listdir(tmp_dir) + for f in output_files: + os.system("cat %s/%s" % (tmp_dir, f)) + os.remove(os.path.join(tmp_dir, f)) if __name__ == '__main__': parser = ArgumentParser() @@ -84,6 +69,9 @@ def ExtractFeaturesForDirsList(args, dirs): parser.add_argument("-j", "--jar", dest="jar", required=True) parser.add_argument("-dir", "--dir", dest="dir", required=False) parser.add_argument("-file", "--file", dest="file", required=False) + # add a new parameter batch_size + parser.add_argument("-batch_size", "--batch_size", dest="batch_size", required=False, default=3, type=int) + args = parser.parse_args() if args.file is not None: @@ -92,9 +80,5 @@ def ExtractFeaturesForDirsList(args, dirs): os.system(command) elif args.dir is not None: subdirs = get_immediate_subdirectories(args.dir) - to_extract = subdirs - if len(subdirs) == 0: - to_extract = [args.dir.rstrip('/')] + to_extract = subdirs if subdirs else [args.dir.rstrip('/')] ExtractFeaturesForDirsList(args, to_extract) - -