# Source code for lisbet.training.preprocessing
"""Preprocessing functions for model training."""
import logging
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split
# [docs]
def split_multi_records(
    multi_records,
    dev_ratio,
    dev_seed,
    task_ids,
    task_data,
):
    """Split records into per-task train and dev sets.

    Every task is first associated with all datasets in ``multi_records``.
    When ``task_data`` is given, the datasets of the tasks it mentions are
    replaced by the explicitly listed dataset indices. The records of each
    selected dataset are then appended to the task's training list, or split
    between training and dev lists when ``dev_ratio`` is set.

    Parameters
    ----------
    multi_records : sequence of sequences
        One collection of records per dataset, addressed by position. Records
        must expose an ``id`` attribute when DEBUG logging is enabled, as the
        ids are only read to build debug messages.
    dev_ratio : float or int or None
        Forwarded to ``train_test_split`` as ``test_size``. If None, no split
        is performed and all records are assigned to the training set.
    dev_seed : int or None
        Forwarded to ``train_test_split`` as ``random_state``. The same value
        is used for every dataset.
    task_ids : iterable of str
        Identifiers of the tasks to build record lists for.
    task_data : str or None
        Optional override of the task to dataset mapping, made of entries of
        the form ``task_id:[i,j,...]``. Only ids listed in ``task_ids`` are
        recognized; tasks that are not mentioned keep using all datasets. An
        empty list (``task_id:[]``) assigns no dataset to that task.

    Returns
    -------
    train_rec : collections.defaultdict of list
        Training records, keyed by task id.
    dev_rec : collections.defaultdict of list
        Dev records, keyed by task id. Lists are empty when ``dev_ratio`` is
        None.
    """
    # Materialize once: task_ids is iterated twice below, which would silently
    # drop every task on the second pass if a one-shot iterator were given.
    task_ids = list(task_ids)

    # Build task to data mapping, by default use all data for every task
    all_indices = range(len(multi_records))
    task_data_map = {task_id: list(all_indices) for task_id in task_ids}

    # Update task to data mapping, if requested. Skipped without task ids, as
    # an empty alternation would match an empty key and add a bogus task.
    if task_data is not None and task_ids:
        logging.debug("Updating task to data mapping")

        # Escape the ids so that regex metacharacters in a task id (e.g. ".")
        # are matched literally rather than interpreted as pattern syntax.
        alternation = "|".join(re.escape(task_id) for task_id in task_ids)
        pattern = r"(\b(?:" + alternation + r")\b):\[(.*?)\]"

        for key, vals in re.findall(pattern, task_data):
            # Blank items (empty brackets, trailing commas) are ignored
            # instead of crashing in int("").
            task_data_map[key] = [int(x) for x in vals.split(",") if x.strip()]

    logging.debug(task_data_map)

    # Create the lists of records for each task
    train_rec = defaultdict(list)
    dev_rec = defaultdict(list)

    # Building the id lists is wasted work unless DEBUG is actually enabled on
    # the root logger, which is the logger used by the module-level helpers.
    debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)

    # Assign records
    for task_id, dataidx_lst in task_data_map.items():
        for dataidx in dataidx_lst:
            # Locate records for the current task
            records = multi_records[dataidx]

            if dev_ratio is not None:
                # Split each dataset independently, then pool the two halves
                train_rec_task, dev_rec_task = train_test_split(
                    records,
                    test_size=dev_ratio,
                    random_state=dev_seed,
                )
                train_rec[task_id].extend(train_rec_task)
                dev_rec[task_id].extend(dev_rec_task)
            else:
                # Assign all records to train sets
                train_rec[task_id].extend(records)

            logging.info(
                "Assigning records from dataset no. %d to task %s", dataidx, task_id
            )

        # Indexing the defaultdicts here also guarantees that every task has
        # an entry in both outputs, even when its list is empty.
        logging.info("Final training set size = %d", len(train_rec[task_id]))
        if debug_enabled:
            logging.debug([rec.id for rec in train_rec[task_id]])
        logging.info("Final dev set size = %d", len(dev_rec[task_id]))
        if debug_enabled:
            logging.debug([rec.id for rec in dev_rec[task_id]])

    return train_rec, dev_rec