Commit b59fee38 authored by Jerome Wuerf's avatar Jerome Wuerf
Browse files

index on tira: c7f07e87 Remove multithreading for initial retrieval

parent c7f07e87
......@@ -29,8 +29,10 @@ services:
- "9300:9300"
volumes:
- /mnt/data/elastic:/usr/share/elasticsearch/data
- ./conifg:/conifg
environment:
- discovery.type=single-node
- logger.level=DEBUG
healthcheck:
test:
[
......
......@@ -5,7 +5,7 @@ from retrieval import (Retrieval, MaximalMarginalRelevanceReranking, StructuralD
from utils import (Configuration, SubCommands, RerankingOptions, parse_cli_args, read_data_to_index,
read_results, read_unranked, read_sentences, read_topics, write_terc_file)
import logging
import pickle
import time
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', level='INFO')
logging.Logger.manager.loggerDict["elastic_transport.transport"].disabled = True
......@@ -38,6 +38,7 @@ class App:
).index_to_es()
def _retrieval(self) -> None:
topics = read_topics(
Path(self.config['INPUT_PATH'],
'topics.xml'),
......@@ -50,6 +51,11 @@ class App:
self.config['TOPIC_NRB'])
self.logger.info('Read unranked!')
else:
if self.config['WAIT_FOR_ES']:
self.logger.info('Waiting 5 minutes for elasticsearch ...')
time.sleep(300)
self.logger.info('Waited for elasticsearch!')
retrieved_results = Retrieval(topics,
self.config['RUN_NAME'],
self.config['MIN_LENGTH_FACTOR'],
......@@ -58,7 +64,7 @@ class App:
self.config['NRB_PREMISES_PER_CONCLUSION']
).retrieve()
# Pattern matching is only available pre Python 3.10 :(
# Pattern matching is only available since Python 3.10 :(
reranker = None
if self.config['RERANKING'] == RerankingOptions.MAXIMAL_MARGINAL_RELEVANCE.value:
reranker = MaximalMarginalRelevanceReranking(
......
......@@ -90,7 +90,7 @@ class Retrieval:
conclusions_token_lengths = [len(analyzed_conc['tokens']) for analyzed_conc in self._analyze_bulk([conc['_source']['sentence_text'] for conc in conclusions])]
premises_per_conclusion = map(lambda x: (x[0], self._get_premises(x[0],x[1])),
tqdm(zip(conclusions,conclusions_token_lengths)))
tqdm(zip(conclusions,conclusions_token_lengths), total=len(conclusions)))
for idx, conclusion_and_premises in enumerate(premises_per_conclusion):
premise_per_conclusion_per_topic[topic_nrb][idx] = {}
......
......@@ -26,7 +26,8 @@ class Configuration():
'RERANKING',
'REUSE_UNRANKED',
'LAMBDA_CONCLUSIONS',
'LAMBDA_PREMISES'],
'LAMBDA_PREMISES',
'WAIT_FOR_ES'],
}
def __init__(self, args: Namespace):
......@@ -60,7 +61,8 @@ class Configuration():
args.reranking,
args.reuse_unranked,
args.lambda_conclusions,
args.lambda_premises
args.lambda_premises,
args.wait_for_es
]
config = dict(zip(self.keys[self.command], args_list))
......
......@@ -85,6 +85,9 @@ def parse_cli_args() -> argparse.Namespace:
type=float,
required=False,
default=0.5)
parser_retrieval.add_argument('--wait-for-es',
action='store_true')
parser_retrieval.add_argument('run_name', type=str, help=Text.run_name)
parser_retrieval.add_argument('input_path', type=str, help=Text.input_path)
parser_retrieval.add_argument('output_path', type=str, help=Text.output_path)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment