Spaces:
Runtime error
Runtime error
| # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import argparse | |
| import numpy as np | |
| import pandas as pd | |
def _sample_negatives(positive_id, num_passages, num_negatives):
    """Draw random passage ids from ``[0, num_passages)`` that avoid ``positive_id``.

    Returns a NumPy integer array of length ``num_negatives``. Sampling is with
    replacement, so the same negative id may appear more than once.

    Raises:
        ValueError: if ``positive_id`` is the only id in range, so that no
            negative can exist.
    """
    positive_in_range = (
        isinstance(positive_id, (int, np.integer)) and 0 <= positive_id < num_passages
    )
    # Nothing to exclude (or nothing to draw): plain uniform sampling.
    if not positive_in_range or num_negatives == 0:
        return np.random.randint(num_passages, size=num_negatives)
    if num_passages < 2:
        raise ValueError(
            "cannot sample negatives: the relevant passage is the only passage "
            f"(num_passages={num_passages})"
        )
    # Sample from a range that is one id shorter, then shift every draw at or
    # above the positive id up by one. The result is uniform over all ids
    # except the positive one, without any rejection loop.
    negatives = np.random.randint(num_passages - 1, size=num_negatives)
    negatives[negatives >= positive_id] += 1
    return negatives


def construct_negatives(input_file, output_file, num_passages, num_negatives):
    """Write one line of randomly sampled negative passages per qrels row.

    ``input_file`` is read as a headerless tab-separated table; column 0 is
    used as the query id and column 2 as the relevant (positive) passage id.
    For every row, a line ``query_id<TAB>positive_id<TAB>neg_1<TAB>...`` is
    written to ``output_file``, where the negatives are random passage ids in
    ``[0, num_passages)``.

    The positive passage id is never emitted as a negative, so a query is not
    paired with its own relevant passage as a negative example.

    Args:
        input_file: path to the tab-separated qrels file.
        output_file: path of the tab-separated file to create or overwrite.
        num_passages: size of the passage id range to sample from.
        num_negatives: number of negative ids to write per row.

    Raises:
        ValueError: if a row's positive id is the only id in
            ``[0, num_passages)`` while negatives are requested.
    """
    qrels = pd.read_csv(input_file, delimiter="\t", header=None)
    with open(output_file, "w", encoding="utf-8") as f:
        # Walk the two relevant columns in lockstep instead of indexing by row.
        for query_id, rel_passage_id in zip(qrels[0], qrels[2]):
            negatives = _sample_negatives(rel_passage_id, num_passages, num_negatives)
            output_ids = [query_id, rel_passage_id, *negatives.tolist()]
            print("\t".join(str(id_) for id_ in output_ids), file=f)
def main():
    """Parse command-line options and build negatives for the train and dev splits.

    For each split, reads ``qrels.<split>.tsv`` from the data folder and writes
    ``query2passages.<split>.tsv`` into the same folder.
    """
    arg_parser = argparse.ArgumentParser(description="Negative passages construction")
    arg_parser.add_argument("--data", type=str, default="msmarco_dataset", help="path to folder with data")
    arg_parser.add_argument("--num_passages", type=int, default=8841823, help="total number of passages")
    arg_parser.add_argument("--num_negatives", type=int, default=10, help="number of negatives per positive")
    options = arg_parser.parse_args()

    for mode in ("train", "dev"):
        qrels_path = f"{options.data}/qrels.{mode}.tsv"
        negatives_path = f"{options.data}/query2passages.{mode}.tsv"
        construct_negatives(
            input_file=qrels_path,
            output_file=negatives_path,
            num_passages=options.num_passages,
            num_negatives=options.num_negatives,
        )
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()