from pathlib import Path
from typing import List

from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, PreTrainedTokenizer


def load_data(file_path: str, tokenizer: PreTrainedTokenizer):
    klue_data = Path(file_path)
    klue_text = klue_data.read_text().strip()
    # Documents are separated by blank lines; each remaining line holds one
    # character and its tag, separated by a tab.
    documents = klue_text.split("\n\n")
    data_list = []
    for doc in documents:
        char_labels = []
        token_labels = []
        chars = []
        sentence = ""
        for line in doc.split("\n"):
            # Skip metadata lines, which start with "##".
            if line.startswith("##"):
                continue
            token, tag = line.split("\t")
            sentence += token
            char_labels.append(tag)
            chars.append(token)
        # return_offsets_mapping requires a fast tokenizer; each entry is the
        # (start, end) character span of a subword token within the sentence.
        offset_mappings = tokenizer(sentence, return_offsets_mapping=True)["offset_mapping"]
        for offset in offset_mappings:
            start, end = offset
            # Special tokens such as [CLS] and [SEP] map to the empty span (0, 0).
            if start == end == 0:
                continue
            # Label each subword token with the tag of its first character.
            token_labels.append(char_labels[start])
        instance = {
            "sentence": sentence,
            "token_label": token_labels,
            "char_label": char_labels,
            "offset_mapping": offset_mappings,
        }
        data_list.append(instance)
    return data_list
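
# KLUE NER uses BIO tags over six entity types: PS (person), LC (location),
# OG (organization), DT (date), TI (time), QT (quantity).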
labels = [
"B-PS",
"I-PS",
"B-LC",
"I-LC",
"B-OG",
"I-OG",
"B-DT",
"I-DT",
"B-TI",
"I-TI",
"B-QT",
"I-QT",
"O",
]
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

class NerDataset(Dataset):
    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        examples: List,
        max_length: int = 512,
        shuffle: bool = False,
        **kwargs,
    ):
        self.dataset = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        # Return the raw instance; batching and tensor conversion are left to
        # the DataLoader's collate_fn.
        instance = self.dataset[index]
        return instance
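
# file_path and max_length are assumed example values (not part of the original
# snippet); point file_path at your local copy of the KLUE NER training data.
file_path = "klue-ner-v1_train.tsv"  # hypothetical path; adjust to your setup
max_length = 128  # assumed cap on tokenized sequence length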
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
examples = load_data(file_path, tokenizer)
print(examples[0])

dataset = NerDataset(
    tokenizer=tokenizer,
    examples=examples,
    max_length=max_length,
)
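
# The snippet uses collate_fn without defining it. Below is a minimal sketch
# (an assumption, not the original author's implementation): it pads each batch
# of sentences, maps token-level tags to ids via label2id, and marks special
# and padding positions with -100 so a token-classification loss ignores them.
import torch


def collate_fn(batch):
    sentences = [instance["sentence"] for instance in batch]
    encoded = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    seq_len = encoded["input_ids"].size(1)
    batch_labels = []
    for instance in batch:
        label_ids = [label2id[tag] for tag in instance["token_label"]]
        label_ids = label_ids[: seq_len - 2]  # leave room for [CLS] and [SEP]
        padded = [-100] + label_ids + [-100] * (seq_len - 1 - len(label_ids))
        batch_labels.append(padded)
    encoded["labels"] = torch.tensor(batch_labels)
    return encoded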

data_loader = DataLoader(
    dataset=dataset,
    collate_fn=collate_fn,
)

for batch in data_loader:
    print(batch)
    break
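
# With the collate_fn sketch above, each batch is a dict of tensors
# (input_ids, token_type_ids, attention_mask, labels) of shape
# (batch_size, seq_len).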