import multiprocessing
from itertools import chain

from datasets import load_dataset
from transformers import AutoTokenizer
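
# Streams a raw text corpus from the Hugging Face Hub, tokenizes it, and
# packs the token stream into fixed-length blocks for causal LM pretraining.
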
class DatasetBuilder:
    def __init__(
        self,
        dataset_name,
        seq_len=8192,
        num_cpu=None,
        hf_account_repo=None,
        tokenizer="EleutherAI/gpt-neox-20b",
    ):
        self.dataset_name = dataset_name
        self.seq_len = seq_len
        self.num_cpu = num_cpu or multiprocessing.cpu_count()
        self.hf_account_repo = hf_account_repo
        self.tokenizer = tokenizer

    def build_dataset(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
        train_dataset = load_dataset(self.dataset_name, split="train", streaming=True)
        dataset = train_dataset.shuffle()

        def tokenize_function(example):
            # Append the EOS token so document boundaries survive concatenation.
            return tokenizer([t + tokenizer.eos_token for t in example["text"]])

        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"],
        )

        block_size = self.seq_len
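
        # Packing example (illustrative): with block_size = 8192, a batch
        # whose concatenated stream holds 20,000 tokens yields two full
        # blocks (16,384 tokens); the trailing 3,616 tokens are dropped.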
        def group_texts(examples):
            # Concatenate every tokenized column into one long stream, then
            # slice it into consecutive blocks of exactly block_size tokens.
            concatenated_examples = {
                k: list(chain(*examples[k])) for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # Truncate to a multiple of block_size so no partial block remains.
            if total_length >= block_size:
                total_length = (total_length // block_size) * block_size
            result = {
                k: [
                    t[i : i + block_size]
                    for i in range(0, total_length, block_size)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        train_tokenized_dataset = tokenized_dataset.map(
            group_texts,
            batched=True,
        )

        if self.hf_account_repo:
            # Push the processed dataset to the Hub. Note: with streaming=True
            # this is an IterableDataset; pushing it directly assumes a recent
            # `datasets` release, otherwise materialize it first (e.g. via
            # Dataset.from_generator).
            train_tokenized_dataset.push_to_hub(self.hf_account_repo)

        return train_tokenized_dataset

builder = DatasetBuilder(
    dataset_name="the_pile_books3",
    seq_len=8192,
    hf_account_repo="kye/thepilebooks3-gptneox-8k",
    tokenizer="EleutherAI/gpt-neox-20b",
)

dataset = builder.build_dataset()
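
# Quick sanity check (illustrative): pull one packed example off the stream
# and confirm each block holds exactly seq_len token ids.
sample = next(iter(dataset))
print(len(sample["input_ids"]))  # expected: 8192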