From 9b824d6a1a832d3362f686b1848ab4117b2f268a Mon Sep 17 00:00:00 2001 From: Torge Date: Fri, 5 Feb 2021 12:11:19 +0100 Subject: [PATCH 1/3] TMP_DIR caused issues by not being set correctly in function `ExtractFeaturesForDirsList`; removed shadowing of Python 3 builtin `dir` --- JavaExtractor/extract.py | 41 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/JavaExtractor/extract.py b/JavaExtractor/extract.py index 55ca9ef..7053f9f 100644 --- a/JavaExtractor/extract.py +++ b/JavaExtractor/extract.py @@ -11,6 +11,8 @@ from argparse import ArgumentParser from subprocess import Popen, PIPE, STDOUT, call +from pprint import pprint + def get_immediate_subdirectories(a_dir): @@ -18,26 +20,22 @@ def get_immediate_subdirectories(a_dir): if os.path.isdir(os.path.join(a_dir, name))] -TMP_DIR = "" - -def ParallelExtractDir(args, dir): - ExtractFeaturesForDir(args, dir, "") +def ParallelExtractDir(args, tmpdir, dir_): + ExtractFeaturesForDir(args,tmpdir, dir_, "") -def ExtractFeaturesForDir(args, dir, prefix): +def ExtractFeaturesForDir(args, tmpdir, dir_, prefix): command = ['java', '-cp', args.jar, 'JavaExtractor.App', '--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width), - '--dir', dir, '--num_threads', str(args.num_threads)] - + '--dir', dir_, '--num_threads', str(args.num_threads)] # print command # os.system(command) kill = lambda process: process.kill() - outputFileName = TMP_DIR + prefix + dir.split('/')[-1] + outputFileName = tmpdir + prefix + dir_.split('/')[-1] failed = False with open(outputFileName, 'a') as outputFile: - sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE) + sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE,) timer = Timer(600000, kill, [sleeper]) - try: timer.start() stdout, stderr = sleeper.communicate() @@ -48,32 +46,31 @@ def ExtractFeaturesForDir(args, dir, prefix): if len(stderr) > 0: 
print(sys.stderr, stderr, file=sys.stdout) else: - print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time', file=sys.stdout) + print(sys.stderr, 'dir: ' + str(dir_) + ' was not completed in time', file=sys.stdout, flush=True) failed = True - subdirs = get_immediate_subdirectories(dir) + subdirs = get_immediate_subdirectories(dir_) for subdir in subdirs: - ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_') + ExtractFeaturesForDir(args, subdir, prefix + dir_.split('/')[-1] + '_') if failed: if os.path.exists(outputFileName): os.remove(outputFileName) def ExtractFeaturesForDirsList(args, dirs): - global TMP_DIR - TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid()) - if os.path.exists(TMP_DIR): - shutil.rmtree(TMP_DIR, ignore_errors=True) - os.makedirs(TMP_DIR) + tmp_dir = f"./tmp/feature_extractor{os.getpid()}/" + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir, ignore_errors=True) + os.makedirs(tmp_dir) try: p = multiprocessing.Pool(4) - p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs)) + p.starmap(ParallelExtractDir, zip(itertools.repeat(args),itertools.repeat(tmp_dir), dirs)) #for dir in dirs: # ExtractFeaturesForDir(args, dir, '') - output_files = os.listdir(TMP_DIR) + output_files = os.listdir(tmp_dir) for f in output_files: - os.system("cat %s/%s" % (TMP_DIR, f)) + os.system("cat %s/%s" % (tmp_dir, f)) finally: - shutil.rmtree(TMP_DIR, ignore_errors=True) + shutil.rmtree(tmp_dir, ignore_errors=True) if __name__ == '__main__': From 04d832b01169c0f3d3c2fb189ce8b987853d54e2 Mon Sep 17 00:00:00 2001 From: Torge Date: Fri, 5 Feb 2021 12:14:11 +0100 Subject: [PATCH 2/3] pprint used for debugging not needed anymore --- JavaExtractor/extract.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/JavaExtractor/extract.py b/JavaExtractor/extract.py index 7053f9f..43f9c74 100644 --- a/JavaExtractor/extract.py +++ b/JavaExtractor/extract.py @@ -11,9 +11,6 @@ from argparse import ArgumentParser from subprocess import 
Popen, PIPE, STDOUT, call -from pprint import pprint - - def get_immediate_subdirectories(a_dir): return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir) From 103362bb62b1a4ed55e435320ee7e448d65ab304 Mon Sep 17 00:00:00 2001 From: lidian <837997288@qq.com> Date: Wed, 20 Sep 2023 11:54:36 +0800 Subject: [PATCH 3/3] I added a batch_size to the extract file, allowing for batch processing of projects instead of loading them all at once. During batch processing, I also incorporated timeout handling. --- JavaExtractor/extract.py | 52 ++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/JavaExtractor/extract.py b/JavaExtractor/extract.py index 43f9c74..acb3ad8 100644 --- a/JavaExtractor/extract.py +++ b/JavaExtractor/extract.py @@ -3,30 +3,22 @@ import itertools import multiprocessing import os -import sys import shutil import subprocess from threading import Timer -import sys from argparse import ArgumentParser -from subprocess import Popen, PIPE, STDOUT, call - def get_immediate_subdirectories(a_dir): return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir) if os.path.isdir(os.path.join(a_dir, name))] - def ParallelExtractDir(args, tmpdir, dir_): - ExtractFeaturesForDir(args,tmpdir, dir_, "") - + ExtractFeaturesForDir(args, tmpdir, dir_, "") def ExtractFeaturesForDir(args, tmpdir, dir_, prefix): command = ['java', '-cp', args.jar, 'JavaExtractor.App', '--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width), '--dir', dir_, '--num_threads', str(args.num_threads)] - # print command - # os.system(command) kill = lambda process: process.kill() outputFileName = tmpdir + prefix + dir_.split('/')[-1] failed = False @@ -39,36 +31,35 @@ def ExtractFeaturesForDir(args, tmpdir, dir_, prefix): finally: timer.cancel() - if sleeper.poll() == 0: - if len(stderr) > 0: - print(sys.stderr, stderr, file=sys.stdout) - else: - print(sys.stderr, 'dir: ' + str(dir_) + ' was not completed 
in time', file=sys.stdout, flush=True) + if sleeper.poll() != 0: failed = True subdirs = get_immediate_subdirectories(dir_) for subdir in subdirs: ExtractFeaturesForDir(args, subdir, prefix + dir_.split('/')[-1] + '_') - if failed: - if os.path.exists(outputFileName): - os.remove(outputFileName) - + + if failed and os.path.exists(outputFileName): + os.remove(outputFileName) def ExtractFeaturesForDirsList(args, dirs): tmp_dir = f"./tmp/feature_extractor{os.getpid()}/" if os.path.exists(tmp_dir): shutil.rmtree(tmp_dir, ignore_errors=True) os.makedirs(tmp_dir) - try: - p = multiprocessing.Pool(4) - p.starmap(ParallelExtractDir, zip(itertools.repeat(args),itertools.repeat(tmp_dir), dirs)) - #for dir in dirs: - # ExtractFeaturesForDir(args, dir, '') + + for i in range(0, len(dirs), args.batch_size): + batch_dirs = dirs[i:i + args.batch_size] + timeout_seconds = 60 # timeout setting + try: + with multiprocessing.Pool(4) as p: + result = p.starmap_async(ParallelExtractDir, zip(itertools.repeat(args), itertools.repeat(tmp_dir), batch_dirs)) + result.get(timeout=timeout_seconds) + except multiprocessing.TimeoutError: + continue + output_files = os.listdir(tmp_dir) for f in output_files: os.system("cat %s/%s" % (tmp_dir, f)) - finally: - shutil.rmtree(tmp_dir, ignore_errors=True) - + os.remove(os.path.join(tmp_dir, f)) if __name__ == '__main__': parser = ArgumentParser() @@ -78,6 +69,9 @@ def ExtractFeaturesForDirsList(args, dirs): parser.add_argument("-j", "--jar", dest="jar", required=True) parser.add_argument("-dir", "--dir", dest="dir", required=False) parser.add_argument("-file", "--file", dest="file", required=False) + # add a new parameter batch_size + parser.add_argument("-batch_size", "--batch_size", dest="batch_size", required=False, default=3, type=int) + args = parser.parse_args() if args.file is not None: @@ -86,9 +80,5 @@ def ExtractFeaturesForDirsList(args, dirs): os.system(command) elif args.dir is not None: subdirs = get_immediate_subdirectories(args.dir) - 
to_extract = subdirs - if len(subdirs) == 0: - to_extract = [args.dir.rstrip('/')] + to_extract = subdirs if subdirs else [args.dir.rstrip('/')] ExtractFeaturesForDirsList(args, to_extract) - -