import pandas as pd

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast


def create_and_train_tokenizer(dataframe, vocab_size=1000, min_frequency=2):
    """
    Create and train a custom WordPiece tokenizer based on the text in our dataset.
    """
    # Initialize a tokenizer with a WordPiece model
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # Use a pre-tokenizer to split text into words on whitespace
    tokenizer.pre_tokenizer = Whitespace()

    # Initialize the trainer
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        min_frequency=min_frequency
    )

    # Prepare the corpus for training
    corpus = dataframe['text'].tolist()

    # Train the tokenizer
    tokenizer.train_from_iterator(corpus, trainer)

    # Add post-processing for BERT-style inputs
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B [SEP]",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]"))
        ],
    )

    # Add padding capability
    tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]")

    # Add truncation capability
    tokenizer.enable_truncation(max_length=128)

    return tokenizer


def convert_to_huggingface_tokenizer(custom_tokenizer, save_path='./tokenizer'):
    """
    Convert the custom tokenizer to a Hugging Face PreTrainedTokenizerFast.
    """
    # Save the custom tokenizer to a JSON file
    custom_tokenizer.save("tokenizer.json")

    # Create a Hugging Face tokenizer from the saved file
    hf_tokenizer = PreTrainedTokenizerFast(
        tokenizer_file="tokenizer.json",
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]"
    )

    # Save the Hugging Face tokenizer
    hf_tokenizer.save_pretrained(save_path)
    print(f"Tokenizer saved to {save_path}")

    return hf_tokenizer


# Example usage:
# Load the training data
train_df = pd.read_csv('train_data.csv')

# Create and train the tokenizer
custom_tokenizer = create_and_train_tokenizer(train_df)

# Convert to Hugging Face format
hf_tokenizer = convert_to_huggingface_tokenizer(custom_tokenizer)

# Test the tokenizer
sample_text = "The temperature in London in July is"
encoded = hf_tokenizer(sample_text)
print(encoded)
print(hf_tokenizer.decode(encoded['input_ids']))

# Tokenizer vocabulary size
print(f"Vocabulary size: {hf_tokenizer.vocab_size}")