| import torch |
| from torch.utils.data import Dataset |
|
|
| import json |
| import os |
| import collections |
|
|
class IntentDataset(Dataset):
    """Intent-detection dataset: tokenizes utterances and maps intent labels
    to integer class ids.

    Each example in ``<mode>.json`` is a list whose LAST element is the intent
    label (a string that must appear in ``all_intents.json``). A 2-element
    example ``[utterance, intent]`` is tokenized as a single sentence; a
    3-element example ``[utterance_a, utterance_b, intent]`` is tokenized as a
    sentence pair.
    """

    def __init__(self, loc, tokenizer, mode, toy=False, max_length=180):
        '''
        You can fine-tune a model with your own data!! Feel free to create (or collect!) your own utterances
        and give it a shot!

        loc: relative directory where the data lies
        tokenizer: huggingface tokenizer to preprocess utterances
        mode: one of train, val, test (should match the respective *.json files)
        toy: load a very small amount of data (for debugging purposes)
        max_length: max length of tokenized input
        '''
        self.tokenizer = tokenizer
        self.mode = mode
        self.max_length = max_length

        # Ordered list of intent names; an example's label id is its
        # position in this list (see __getitem__).
        with open(os.path.join(loc, 'all_intents.json'), 'r') as all_intents_json:
            self.all_intents = json.load(all_intents_json)

        with open(os.path.join(loc, mode + '.json'), 'r') as json_data:
            self.all_data = json.load(json_data)

        if toy:
            # Keep only a handful of examples for quick debugging runs.
            self.all_data = self.all_data[:10]

        print(f"Loaded Intent detection dataset. {len(self.all_data)} examples. ({mode}). {'Toy example' if toy else ''}")

    def __len__(self):
        return len(self.all_data)

    def __getitem__(self, index):
        '''
        Returns a dict with 'input_ids', 'attention_mask' (both shape
        (max_length,)), 'label' (scalar LongTensor), and — when the tokenizer
        produces them — 'token_type_ids' (shape (max_length,)).
        '''
        data_item = self.all_data[index]

        if len(data_item) == 3:
            # Sentence-pair input: [text_a, text_b, intent].
            tokenized_input = self.tokenizer(data_item[0], data_item[1], return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        else:
            # Single-sentence input: [text, intent].
            tokenized_input = self.tokenizer(data_item[0], return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)

        # The tokenizer returns batched tensors of shape (1, max_length);
        # squeeze the batch dim so the DataLoader's collate re-batches cleanly.
        output_item = {
            'input_ids': tokenized_input['input_ids'].squeeze(0),
            'attention_mask': tokenized_input['attention_mask'].squeeze(0),
            'label': torch.tensor(self.all_intents.index(data_item[-1]))
        }
        if 'token_type_ids' in tokenized_input:
            # BUG FIX: the original line ended with a trailing comma, which
            # stored a 1-tuple containing the tensor instead of the tensor.
            output_item['token_type_ids'] = tokenized_input['token_type_ids'].squeeze(0)
        return output_item
| |
| |
|
|