Text Generation¶
In this notebook we will look at 3 different examples:
Building a Seq2Seq model for machine translation using RNNs with and without Attention
Playing with T5 for summarization and translation
Exercise with prompts and language generation using various models. Note that in this section you need to stop the notebook, disconnect the runtime, and restart it before running each specific model: the T4 GPU has 15 GB of RAM, and models like Qwen 3 are too large to load alongside the others.
The sequence to sequence architecture is inspired by the Keras Tutorial https://keras.io/examples/nlp/lstm_seq2seq/.
Notebook Contents¶
1. Setup¶
We first need to do the usual setup. We will also use some nltk and sklearn components in order to tokenize the text.
This notebook requires a few prerequisite packages, installed below, as well as a dataset that we download and store locally.
#@title Installs
!pip install pydot --quiet
!pip install transformers --quiet
!pip install sentencepiece --quiet
!pip install nltk --quiet
#@title Imports
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
import os
import nltk
import matplotlib.pyplot as plt
import re
import textwrap
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
from transformers import GPT2Tokenizer, TFOPTForCausalLM
nltk.download('punkt_tab')
[nltk_data] Downloading package punkt_tab to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt_tab.zip.
True
2. Building a Seq2Seq model for Translation using RNNs with and without Attention¶
2.1 Downloading and pre-processing Data¶
Let's get the data. Just like the Keras tutorial, we will use http://www.manythings.org as the source for the parallel corpus, but we will use German. Machine translation requires sentence pairs for training, that is, individual sentences in German paired with the corresponding sentences in English.
!!curl -O http://www.manythings.org/anki/deu-eng.zip
!!unzip deu-eng.zip
['Archive: deu-eng.zip', ' inflating: deu.txt ', ' inflating: _about.txt ']
Next, we need to set a few parameters. Note that these numbers are much smaller than we would use in a real-world system. For example, vocabulary sizes of 2,000 and 3,000 would be unrealistic unless we were dealing with a highly specialized domain.
embed_dim = 100 # Embedding dimensions for vectors and LSTMs.
num_samples = 10000 # Number of examples to consider.
# Path to the data txt file on disk.
data_path = "deu.txt"
# Vocabulary sizes that we'll use:
english_vocab_size = 2000
german_vocab_size = 3000
Next, we need to format the input. In particular we would like to use nltk to help with the tokenization. We will then use sklearn's CountVectorizer to create a vocabulary from the most frequent words in each language.
(Before, we used pre-trained word embeddings from Word2Vec that came with a defined vocabulary. This time, we'll start from scratch, and need to extract the vocabulary from the training text.)
# Initialize empty lists to hold preprocessed text pairs
input_texts = [] # English sentences
target_texts = [] # German sentences
# Initialize max length trackers for both input and output sequences
max_input_length = -1
max_output_length = -1
# Open and read the dataset file (tab-separated, with 3 columns per line)
with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")  # Split the file into individual lines

# Iterate through each line, up to the specified number of samples
for line in lines[: min(num_samples, len(lines) - 1)]:
    # Each line has the format: English_text \t German_text \t metadata
    input_text, target_text, _ = line.split("\t")

    # Tokenize the English and German sentences into word tokens
    tokenized_source_text = nltk.word_tokenize(input_text, language='english')
    tokenized_target_text = nltk.word_tokenize(target_text, language='german')

    # Track the longest tokenized sequence (for padding or model input sizing)
    if len(tokenized_source_text) > max_input_length:
        max_input_length = len(tokenized_source_text)
    if len(tokenized_target_text) > max_output_length:
        max_output_length = len(tokenized_target_text)

    # Reconstruct tokenized text into lowercase, space-separated strings
    source_text = (' '.join(tokenized_source_text)).lower()
    target_text = (' '.join(tokenized_target_text)).lower()

    # Add processed text pairs to the lists
    input_texts.append(source_text)
    target_texts.append(target_text)
# Create English vocabulary using CountVectorizer (Bag-of-Words style)
vectorizer_english = CountVectorizer(max_features=english_vocab_size)
vectorizer_english.fit(input_texts)
vocab_english = vectorizer_english.get_feature_names_out() # List of top English words
# Create German vocabulary using CountVectorizer
vectorizer_german = CountVectorizer(max_features=german_vocab_size)
vectorizer_german.fit(target_texts)
vocab_german = vectorizer_german.get_feature_names_out() # List of top German words
# Output useful statistics for later model setup
print('Maximum source input length: ', max_input_length)
print('Maximum target output length: ', max_output_length)
Maximum source input length: 6 Maximum target output length: 10
What max_features does here¶
CountVectorizer.fit(...) scans the corpus and ranks tokens by overall frequency (after its own tokenization rules). max_features=N keeps only the top N most frequent tokens from that corpus.
- English keeps the top 2,000 tokens → vocab_english has length 2,000.
- German keeps the top 3,000 tokens → vocab_german has length 3,000.
Everything not in those top-N lists becomes out-of-vocabulary (OOV) and will map to your <unk> later.
Why this matters downstream¶
These top-N lists are the base vocabularies you map to IDs; then you append specials:
- Source: size → 2000 + 2 (<unk>, <pad>)
- Target: size → 3000 + 4 (<unk>, <start>, <end>, <pad>)
The limits control:
- Model size (embedding matrices, softmax head).
- Coverage vs. OOV rate (smaller vocab → more <unk>).
- Training speed/memory.
Setting English=2k and German=3k makes the vectorizers keep only the 2,000/3,000 most frequent tokens for each side. That defines your base vocab sizes before you append special tokens, directly controlling the memory/compute vs. OOV trade-off.
input_texts[:2]
['go .', 'hi .']
target_texts[:2]
['geh .', 'hallo !']
These look simple but correct.
So the source and target sequences have max lengths 6 and 11, respectively. As we will add start and end tokens (<s> and </s>) on the decoder side, we set the respective max lengths to:
max_encoder_seq_length = max_input_length      # measured above
max_decoder_seq_length = max_output_length + 2 # room for <start> and <end>
Next, we create the dictionaries translating between integer ids and tokens for both source (English) and target (German).
# ---- Source (English) vocab <-> id mappings ---------------------------------
source_id_vocab_dict = {} # maps integer IDs -> tokens
source_vocab_id_dict = {} # maps tokens -> integer IDs
# Assign IDs to each English vocab item (0..english_vocab_size-1)
for sid, svocab in enumerate(vocab_english):
    source_id_vocab_dict[sid] = svocab
    source_vocab_id_dict[svocab] = sid
# Reserve extra IDs for special tokens (placed AFTER the base vocab range)
source_id_vocab_dict[english_vocab_size] = "<unk>" # unknown token for OOV words
source_id_vocab_dict[english_vocab_size + 1] = "<pad>" # padding token
# And the reverse mapping for source language
source_vocab_id_dict["<unk>"] = english_vocab_size
source_vocab_id_dict["<pad>"] = english_vocab_size + 1
# ---- Target (German) vocab <-> id mappings ----------------------------------
target_id_vocab_dict = {} # maps integer IDs -> tokens
target_vocab_id_dict = {} # maps tokens -> integer IDs
# Assign IDs to each German vocab item (0..german_vocab_size-1)
for tid, tvocab in enumerate(vocab_german):
    target_id_vocab_dict[tid] = tvocab
    target_vocab_id_dict[tvocab] = tid
# Add special tokens for the target side:
# <unk> : unknown token (OOV handling)
# <start>: marks decoder start-of-sequence
# <end> : marks decoder end-of-sequence
# <pad> : padding for batching/sequencing alignment
target_id_vocab_dict[german_vocab_size] = "<unk>"
target_id_vocab_dict[german_vocab_size + 1] = "<start>"
target_id_vocab_dict[german_vocab_size + 2] = "<end>"
target_id_vocab_dict[german_vocab_size + 3] = "<pad>"
# Reverse mapping for target language
target_vocab_id_dict["<unk>"] = german_vocab_size
target_vocab_id_dict["<start>"] = german_vocab_size + 1
target_vocab_id_dict["<end>"] = german_vocab_size + 2
target_vocab_id_dict["<pad>"] = german_vocab_size + 3
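The pattern above (base tokens get IDs 0..V-1, specials appended at the end) can be sanity-checked with a tiny hypothetical vocabulary:

```python
# Hypothetical 3-word base vocab; specials appended after it, mirroring the source-side setup.
base_vocab = ["go", "hi", "run"]
id_to_tok = dict(enumerate(base_vocab))
tok_to_id = {tok: i for i, tok in id_to_tok.items()}

V = len(base_vocab)
for offset, special in enumerate(["<unk>", "<pad>"]):
    id_to_tok[V + offset] = special
    tok_to_id[special] = V + offset

# Forward and reverse mappings agree, and <pad> ends up with the last ID.
assert all(tok_to_id[id_to_tok[i]] == i for i in id_to_tok)
print(tok_to_id["<pad>"], len(tok_to_id) - 1)  # 4 4
```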
Creating Train/Test Data for the Seq2Seq Models¶
We need a small helper function to:
- Split data into train and test sets.
- Pad sequences to fixed lengths.
- Add <start>/<end> tokens on the decoder side.
This function will produce three sequences of vocab IDs:
Encoder inputs (source language)
- What the encoder reads.
- Composition: source_tokens + <pad>... (no start/end needed).
Decoder inputs (target language, teacher forcing)
- What the decoder receives at each time step.
- Composition: <start> + target_tokens + <end> + <pad>...
Decoder labels (targets to predict)
- What the decoder should predict at each time step.
- Composition: target_tokens + <end> + <pad>...
- This is shifted by one relative to the decoder inputs so the model learns "given previous tokens, predict the next token."
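Before the function itself, here is a toy walk-through of the three compositions with made-up IDs (the vocab sizes and token IDs below are hypothetical, chosen only to mirror the layout described above):

```python
# Hypothetical source-side vocab: 4 base tokens + <unk>, <pad>
SRC_V = 4
SRC_PAD = SRC_V + 1          # <pad> is the last ID (5)
enc_len = 5
source_tokens = [1, 3]       # a made-up tokenized source sentence

# Encoder input: raw tokens + pads
enc_in = (source_tokens + [SRC_PAD] * enc_len)[:enc_len]

# Hypothetical target-side vocab: 5 base tokens + <unk>, <start>, <end>, <pad>
TGT_V = 5
START, END, PAD = TGT_V + 1, TGT_V + 2, TGT_V + 3   # 6, 7, 8
dec_len = 7
target_tokens = [2, 0, 4]    # a made-up tokenized target sentence

# Decoder input (teacher forcing): <start> + tokens + <end> + pads
dec_in = ([START] + target_tokens + [END] + [PAD] * dec_len)[:dec_len]
# Decoder labels (one-step shift): tokens + <end> + pads
dec_out = (target_tokens + [END] + [PAD] * dec_len)[:dec_len]

print(enc_in)   # [1, 3, 5, 5, 5]
print(dec_in)   # [6, 2, 0, 4, 7, 8, 8]
print(dec_out)  # [2, 0, 4, 7, 8, 8, 8]
```

Note that dec_out is exactly dec_in shifted left by one step, which is the next-token prediction target.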
def convert_text_to_data(texts,
                         vocab_id_dict,
                         max_length=20,
                         type=None,
                         train_test_vector=None,
                         samples=100000):
    """
    Convert tokenized (space-delimited) texts into fixed-length ID sequences
    with task-specific special tokens (start/end/pad) and an optional
    train/test split mask.

    Args:
      texts: iterable of strings, already tokenized as space-separated tokens.
      vocab_id_dict: dict mapping token -> id. Must include "<unk>" and:
        - For source: "<pad>"
        - For target: "<start>", "<end>", "<pad>"
      max_length: output sequence length (after padding/truncation).
      type: one of {'input_source', 'input_target', 'output_target'}.
      train_test_vector: optional boolean mask (True=train, False=test)
        aligned to the first len(texts[:samples]) items.
      samples: max number of examples to process.

    Returns:
      (np.array(train_data), np.array(test_data))
    """
    # Require an explicit type to avoid accidental misuse
    if type is None:
        raise ValueError("'type' is not defined. Please choose from: "
                         "input_source, input_target, output_target.")

    train_data = []
    test_data = []

    # We rely on specials being appended to the end of the vocab. That is:
    #   SOURCE: ..., <unk>=V, <pad>=V+1
    #           => vocab_size = V+2; pad = vocab_size-1
    #   TARGET: ..., <unk>=V, <start>=V+1, <end>=V+2, <pad>=V+3
    #           => vocab_size = V+4; start=vocab_size-3, end=vocab_size-2, pad=vocab_size-1
    vocab_size = len(vocab_id_dict)

    # Process up to `samples` texts
    for text_num, text in enumerate(texts[:samples]):
        # Convert each token to its id; use <unk> for OOV
        sentence_ids = [vocab_id_dict.get(token, vocab_id_dict["<unk>"])
                        for token in text.split()]

        # Build final id sequence per task type, then pad/truncate to `max_length`.
        if type == 'input_source':
            # Encoder input: raw source tokens + PADs
            ids = (sentence_ids + [vocab_size - 1] * max_length)[:max_length]
        elif type == 'input_target':
            # Decoder input (teacher forcing): <START> + target tokens + <END> + PADs
            ids = ([vocab_size - 3] + sentence_ids + [vocab_size - 2] + [vocab_size - 1] * max_length)[:max_length]
        elif type == 'output_target':
            # Decoder output (shifted target): target tokens + <END> + PADs
            ids = (sentence_ids + [vocab_size - 2] + [vocab_size - 1] * max_length)[:max_length]

        # If a split mask is provided and this item is marked False -> test set
        if train_test_vector is not None and not train_test_vector[text_num]:
            test_data.append(ids)
        else:
            train_data.append(ids)

    return np.array(train_data), np.array(test_data)
# Random 80/20 split mask for (up to) 10,000 items: values > 0.2 map to True (train)
train_test_split_vector = (np.random.uniform(size=10000) > 0.2)
# Build encoder inputs (source side)
train_source_input_data, test_source_input_data = convert_text_to_data(
input_texts,
source_vocab_id_dict,
type='input_source',
max_length=max_encoder_seq_length,
train_test_vector=train_test_split_vector
)
# Build decoder inputs (target side, with <START> ... <END>)
train_target_input_data, test_target_input_data = convert_text_to_data(
target_texts,
target_vocab_id_dict,
type='input_target',
max_length=max_decoder_seq_length,
train_test_vector=train_test_split_vector
)
# Build decoder outputs (target side, shifted, ends with <END>)
train_target_output_data, test_target_output_data = convert_text_to_data(
target_texts,
target_vocab_id_dict,
type='output_target',
max_length=max_decoder_seq_length,
train_test_vector=train_test_split_vector
)
Because you appended special tokens at the end of the vocab, the <pad> token ends up with the last ID, which equals vocab_size - 1.
Quick recap from your earlier construction:
Source (English) side
- Base vocab IDs: 0 ... english_vocab_size-1
- You then added: <unk> at english_vocab_size, <pad> at english_vocab_size + 1
- So total vocab_size = english_vocab_size + 2
- Therefore <pad> ID = english_vocab_size + 1 = vocab_size - 1
Target (German) side
- Base vocab IDs: 0 ... german_vocab_size-1
- You then added: <unk> at german_vocab_size, <start> at german_vocab_size + 1, <end> at german_vocab_size + 2, <pad> at german_vocab_size + 3
- So total vocab_size = german_vocab_size + 4
- Therefore <pad> ID = german_vocab_size + 3 = vocab_size - 1
So, in both cases, <pad> is the last entry ⇒ its ID is vocab_size - 1.
Let us look at a few examples. They appear correct.
train_source_input_data[:2]
array([[ 765, 2000, 2001, 2001, 2001, 2001],
[ 828, 2000, 2001, 2001, 2001, 2001]])
train_target_input_data[:2]
array([[3001, 928, 3000, 3002, 3003, 3003, 3003, 3003, 3003, 3003, 3003,
3003, 3003],
[3001, 1118, 1098, 3000, 3002, 3003, 3003, 3003, 3003, 3003, 3003,
3003, 3003]])
train_target_output_data[:2]
array([[ 928, 3000, 3002, 3003, 3003, 3003, 3003, 3003, 3003, 3003, 3003,
3003, 3003],
[1118, 1098, 3000, 3002, 3003, 3003, 3003, 3003, 3003, 3003, 3003,
3003, 3003]])
They match exactly what our data pipeline promises:
ID scheme¶
- English (source): base 0–1999, <unk>=2000, <pad>=2001.
- German (target): base 0–2999, <unk>=3000, <start>=3001, <end>=3002, <pad>=3003.
What we expect each tensor to look like¶
- train_source_input_data (encoder input): tokens + <pad>… 👉 Example starts with a real token (765), has an OOV (2000=<unk>), then tails of 2001=<pad> → correct.
- train_target_input_data (decoder input, teacher forcing): <start> + tokens + <end> + <pad>… 👉 Example begins with 3001=<start>, then a token (928), later 3002=<end>, then lots of 3003=<pad> → correct.
- train_target_output_data (decoder labels): tokens + <end> + <pad>… (one-step shift vs decoder input) 👉 Example begins with the first token (928), includes 3002=<end>, and pads with 3003, with no <start> at the front → correct.
Alignment check (first row, conceptually)¶
input_target : [<start>, 928, ..., 3002, <pad>, <pad>, ...]
output_target: [928, ..., 3002, <pad>, <pad>, ...]
At each time step, the label is the next token of the decoder input: exactly the intended shift for next-token prediction.
So:
- Source rows end with 2001 pads ✔️
- Decoder-input rows start with 3001 and contain a 3002 end before 3003 pads ✔️
- Decoder-output rows omit 3001, include 3002, then 3003 pads ✔️
All IDs are where they should be, and the sequences have the right structure and shift.
2.2 The Seq2seq model without Attention¶
We need to build both the encoder and the decoder and we'll use LSTMs. We'll set up the system first without an attention layer between the encoder and decoder.
def create_translation_model_no_att(encode_vocab_size, decode_vocab_size, embed_dim):
    """
    Build a basic seq2seq (Encoder-Decoder) LSTM translation model WITHOUT attention.
    Uses teacher forcing during training: decoder sees <start> + tokens and predicts next tokens.

    Args:
      encode_vocab_size: size of source (encoder) vocab including specials
      decode_vocab_size: size of target (decoder) vocab including specials
      embed_dim: dimension for both embeddings and LSTM hidden size (for simplicity)

    Returns:
      A compiled tf.keras.Model that maps:
        [source_input_ids, target_input_ids] -> decoder token distributions over decode_vocab_size
    """
    # ----- Inputs (padded ID sequences) -----
    # Encoder input shape: (batch, max_encoder_seq_length)
    source_input_no_att = tf.keras.layers.Input(
        shape=(max_encoder_seq_length,), dtype='int64', name='source_input_no_att'
    )
    # Decoder input shape: (batch, max_decoder_seq_length)
    # Contains <start> ... tokens ... <end> + PADs
    target_input_no_att = tf.keras.layers.Input(
        shape=(max_decoder_seq_length,), dtype='int64', name='target_input_no_att'
    )

    # ----- Embedding layers (token IDs -> dense vectors) -----
    # NOTE: consider mask_zero=True to ignore PADs automatically in LSTM
    source_embedding_layer_no_att = tf.keras.layers.Embedding(
        input_dim=encode_vocab_size, output_dim=embed_dim, name='source_embedding_layer_no_att'
        # , mask_zero=True
    )
    target_embedding_layer_no_att = tf.keras.layers.Embedding(
        input_dim=decode_vocab_size, output_dim=embed_dim, name='target_embedding_layer_no_att'
        # , mask_zero=True
    )

    # Embedded sequences:
    # source_embeddings: (batch, max_enc_len, embed_dim)
    source_embeddings_no_att = source_embedding_layer_no_att(source_input_no_att)
    # target_embeddings: (batch, max_dec_len, embed_dim)
    target_embeddings_no_att = target_embedding_layer_no_att(target_input_no_att)

    # ----- Encoder LSTM -----
    # return_sequences=True -> provide hidden state at each time step (not used further here)
    # return_state=True     -> also return the final (h, c) states to initialize the decoder
    encoder_lstm_layer_no_att = tf.keras.layers.LSTM(
        embed_dim, return_sequences=True, return_state=True, name='encoder_lstm_layer_no_att'
    )
    encoder_out_no_att, encoder_state_h_no_att, encoder_state_c_no_att = \
        encoder_lstm_layer_no_att(source_embeddings_no_att)
    # Shapes:
    #   encoder_out_no_att:     (batch, max_enc_len, embed_dim) [unused downstream]
    #   encoder_state_h_no_att: (batch, embed_dim)
    #   encoder_state_c_no_att: (batch, embed_dim)

    # ----- Decoder LSTM (teacher forcing) -----
    # Initialized with encoder final states so it conditions on the source sentence.
    decoder_lstm_layer_no_att = tf.keras.layers.LSTM(
        embed_dim, return_sequences=True, return_state=False, name='decoder_lstm_layer_no_att'
    )
    # decoder_lstm_out: (batch, max_dec_len, embed_dim)
    decoder_lstm_out_no_att = decoder_lstm_layer_no_att(
        target_embeddings_no_att, initial_state=[encoder_state_h_no_att, encoder_state_c_no_att]
    )

    # ----- Token classification over target vocab at each time step -----
    # Dense applies to the last axis at each time step:
    #   output: (batch, max_dec_len, decode_vocab_size), softmax over vocab axis
    target_classification_no_att = tf.keras.layers.Dense(
        decode_vocab_size, activation='softmax', name='classification_no_att'
    )(decoder_lstm_out_no_att)

    # ----- Assemble & compile -----
    translation_model_no_att = tf.keras.models.Model(
        inputs=[source_input_no_att, target_input_no_att],
        outputs=[target_classification_no_att]
    )
    # Using sparse_categorical_crossentropy so labels are integer IDs (no one-hot needed).
    # Expect labels shaped (batch, max_dec_len) or (batch, max_dec_len, 1).
    translation_model_no_att.compile(
        optimizer="Adam",
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return translation_model_no_att
Now we can call the function we created to instantiate that model and confirm that it is set up the way we like using model.summary().
encode_vocab_size = len(source_id_vocab_dict.keys())
decode_vocab_size = len(target_id_vocab_dict.keys())
translation_model_no_att = create_translation_model_no_att(encode_vocab_size, decode_vocab_size, embed_dim) # embed_dim = 100
translation_model_no_att.summary()
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
source_input_no_att (Input [(None, 6)] 0 []
Layer)
target_input_no_att (Input [(None, 13)] 0 []
Layer)
source_embedding_layer_no_ (None, 6, 100) 200200 ['source_input_no_att[0][0]']
att (Embedding)
target_embedding_layer_no_ (None, 13, 100) 300400 ['target_input_no_att[0][0]']
att (Embedding)
encoder_lstm_layer_no_att [(None, 6, 100), 80400 ['source_embedding_layer_no_at
(LSTM) (None, 100), t[0][0]']
(None, 100)]
decoder_lstm_layer_no_att (None, 13, 100) 80400 ['target_embedding_layer_no_at
(LSTM) t[0][0]',
'encoder_lstm_layer_no_att[0]
[1]',
'encoder_lstm_layer_no_att[0]
[2]']
classification_no_att (Den (None, 13, 3004) 303404 ['decoder_lstm_layer_no_att[0]
se) [0]']
==================================================================================================
Total params: 964804 (3.68 MB)
Trainable params: 964804 (3.68 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________
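The parameter counts in this summary can be checked by hand. A small sketch recomputing each layer's count from the shapes above:

```python
embed_dim = 100
encode_vocab_size = 2000 + 2   # English base vocab + <unk>, <pad>
decode_vocab_size = 3000 + 4   # German base vocab + <unk>, <start>, <end>, <pad>

def lstm_params(input_dim, units):
    # An LSTM has 4 gates, each with input weights, recurrent weights, and a bias vector.
    return 4 * ((input_dim + units) * units + units)

params = {
    "source_embedding": encode_vocab_size * embed_dim,      # 2002 * 100 = 200200
    "target_embedding": decode_vocab_size * embed_dim,      # 3004 * 100 = 300400
    "encoder_lstm": lstm_params(embed_dim, embed_dim),      # 80400
    "decoder_lstm": lstm_params(embed_dim, embed_dim),      # 80400
    "classification": (embed_dim + 1) * decode_vocab_size,  # weights + bias = 303404
}
print(sum(params.values()))  # 964804, matching "Total params" above
```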
It never hurts to look at the shapes of the outputs.
translation_model_no_att.predict(x=[train_source_input_data, train_target_input_data]).shape
251/251 [==============================] - 2s 3ms/step
(8005, 13, 3004)
Now that everything checks out, we can train our model.
translation_model_no_att.fit(x=[train_source_input_data, train_target_input_data],
y=train_target_output_data,
validation_data=([test_source_input_data, test_target_input_data],
test_target_output_data),
epochs=40)
Epoch 1/40 251/251 [==============================] - 14s 39ms/step - loss: 2.5500 - accuracy: 0.6432 - val_loss: 1.7015 - val_accuracy: 0.7366 Epoch 2/40 251/251 [==============================] - 3s 12ms/step - loss: 1.5711 - accuracy: 0.7612 - val_loss: 1.5078 - val_accuracy: 0.7731 Epoch 3/40 251/251 [==============================] - 2s 9ms/step - loss: 1.4231 - accuracy: 0.7736 - val_loss: 1.4108 - val_accuracy: 0.7754 Epoch 4/40 251/251 [==============================] - 2s 9ms/step - loss: 1.3171 - accuracy: 0.7864 - val_loss: 1.3232 - val_accuracy: 0.7921 Epoch 5/40 251/251 [==============================] - 2s 9ms/step - loss: 1.2263 - accuracy: 0.8000 - val_loss: 1.2619 - val_accuracy: 0.8003 Epoch 6/40 251/251 [==============================] - 2s 8ms/step - loss: 1.1551 - accuracy: 0.8086 - val_loss: 1.2080 - val_accuracy: 0.8088 Epoch 7/40 251/251 [==============================] - 2s 8ms/step - loss: 1.0942 - accuracy: 0.8148 - val_loss: 1.1693 - val_accuracy: 0.8119 Epoch 8/40 251/251 [==============================] - 2s 9ms/step - loss: 1.0427 - accuracy: 0.8203 - val_loss: 1.1368 - val_accuracy: 0.8183 Epoch 9/40 251/251 [==============================] - 2s 8ms/step - loss: 0.9954 - accuracy: 0.8265 - val_loss: 1.1053 - val_accuracy: 0.8227 Epoch 10/40 251/251 [==============================] - 2s 7ms/step - loss: 0.9516 - accuracy: 0.8315 - val_loss: 1.0785 - val_accuracy: 0.8267 Epoch 11/40 251/251 [==============================] - 2s 7ms/step - loss: 0.9099 - accuracy: 0.8357 - val_loss: 1.0560 - val_accuracy: 0.8293 Epoch 12/40 251/251 [==============================] - 2s 9ms/step - loss: 0.8703 - accuracy: 0.8403 - val_loss: 1.0382 - val_accuracy: 0.8324 Epoch 13/40 251/251 [==============================] - 2s 8ms/step - loss: 0.8342 - accuracy: 0.8440 - val_loss: 1.0235 - val_accuracy: 0.8337 Epoch 14/40 251/251 [==============================] - 2s 8ms/step - loss: 0.7998 - accuracy: 0.8478 - val_loss: 1.0087 - val_accuracy: 0.8358 
Epoch 15/40 251/251 [==============================] - 2s 7ms/step - loss: 0.7668 - accuracy: 0.8518 - val_loss: 0.9961 - val_accuracy: 0.8387 Epoch 16/40 251/251 [==============================] - 2s 8ms/step - loss: 0.7349 - accuracy: 0.8567 - val_loss: 0.9833 - val_accuracy: 0.8411 Epoch 17/40 251/251 [==============================] - 2s 7ms/step - loss: 0.7046 - accuracy: 0.8603 - val_loss: 0.9760 - val_accuracy: 0.8416 Epoch 18/40 251/251 [==============================] - 2s 8ms/step - loss: 0.6747 - accuracy: 0.8633 - val_loss: 0.9628 - val_accuracy: 0.8443 Epoch 19/40 251/251 [==============================] - 2s 7ms/step - loss: 0.6456 - accuracy: 0.8674 - val_loss: 0.9549 - val_accuracy: 0.8462 Epoch 20/40 251/251 [==============================] - 2s 7ms/step - loss: 0.6187 - accuracy: 0.8709 - val_loss: 0.9460 - val_accuracy: 0.8471 Epoch 21/40 251/251 [==============================] - 2s 7ms/step - loss: 0.5921 - accuracy: 0.8744 - val_loss: 0.9416 - val_accuracy: 0.8476 Epoch 22/40 251/251 [==============================] - 2s 7ms/step - loss: 0.5669 - accuracy: 0.8790 - val_loss: 0.9313 - val_accuracy: 0.8488 Epoch 23/40 251/251 [==============================] - 2s 8ms/step - loss: 0.5415 - accuracy: 0.8824 - val_loss: 0.9261 - val_accuracy: 0.8510 Epoch 24/40 251/251 [==============================] - 2s 8ms/step - loss: 0.5182 - accuracy: 0.8855 - val_loss: 0.9194 - val_accuracy: 0.8517 Epoch 25/40 251/251 [==============================] - 2s 8ms/step - loss: 0.4947 - accuracy: 0.8895 - val_loss: 0.9177 - val_accuracy: 0.8538 Epoch 26/40 251/251 [==============================] - 2s 7ms/step - loss: 0.4730 - accuracy: 0.8936 - val_loss: 0.9160 - val_accuracy: 0.8524 Epoch 27/40 251/251 [==============================] - 2s 7ms/step - loss: 0.4516 - accuracy: 0.8975 - val_loss: 0.9107 - val_accuracy: 0.8540 Epoch 28/40 251/251 [==============================] - 2s 7ms/step - loss: 0.4316 - accuracy: 0.9010 - val_loss: 0.9042 - val_accuracy: 
0.8553 Epoch 29/40 251/251 [==============================] - 2s 7ms/step - loss: 0.4122 - accuracy: 0.9047 - val_loss: 0.9080 - val_accuracy: 0.8559 Epoch 30/40 251/251 [==============================] - 2s 7ms/step - loss: 0.3933 - accuracy: 0.9082 - val_loss: 0.9039 - val_accuracy: 0.8573 Epoch 31/40 251/251 [==============================] - 2s 7ms/step - loss: 0.3751 - accuracy: 0.9122 - val_loss: 0.9028 - val_accuracy: 0.8581 Epoch 32/40 251/251 [==============================] - 2s 7ms/step - loss: 0.3581 - accuracy: 0.9157 - val_loss: 0.8979 - val_accuracy: 0.8591 Epoch 33/40 251/251 [==============================] - 2s 8ms/step - loss: 0.3411 - accuracy: 0.9194 - val_loss: 0.9017 - val_accuracy: 0.8594 Epoch 34/40 251/251 [==============================] - 2s 7ms/step - loss: 0.3255 - accuracy: 0.9220 - val_loss: 0.9014 - val_accuracy: 0.8601 Epoch 35/40 251/251 [==============================] - 2s 8ms/step - loss: 0.3099 - accuracy: 0.9252 - val_loss: 0.8974 - val_accuracy: 0.8611 Epoch 36/40 251/251 [==============================] - 2s 8ms/step - loss: 0.2947 - accuracy: 0.9291 - val_loss: 0.9053 - val_accuracy: 0.8595 Epoch 37/40 251/251 [==============================] - 2s 7ms/step - loss: 0.2811 - accuracy: 0.9323 - val_loss: 0.9030 - val_accuracy: 0.8602 Epoch 38/40 251/251 [==============================] - 2s 7ms/step - loss: 0.2683 - accuracy: 0.9353 - val_loss: 0.9009 - val_accuracy: 0.8611 Epoch 39/40 251/251 [==============================] - 2s 7ms/step - loss: 0.2556 - accuracy: 0.9382 - val_loss: 0.9050 - val_accuracy: 0.8617 Epoch 40/40 251/251 [==============================] - 2s 7ms/step - loss: 0.2428 - accuracy: 0.9410 - val_loss: 0.9092 - val_accuracy: 0.8613
<tf_keras.src.callbacks.History at 0x7a1e3b0629c0>
2.3 The Seq2seq model with Attention¶
All we need to do is add an attention layer that creates a context vector for each decoder position. We can use the attention layer provided by Keras in tf.keras.layers.Attention(). We will then simply concatenate these corresponding context vectors with the output of the LSTM layer in order to predict the translation tokens one by one.
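For intuition, here is a NumPy sketch of what tf.keras.layers.Attention computes by default (dot-product scores with key = value and no scaling); the shapes mirror our model, and the function name is ours, not a Keras API:

```python
import numpy as np

def dot_product_attention(query, value):
    # scores[b, t_dec, t_enc] = query[b, t_dec] . value[b, t_enc]
    scores = np.einsum('bqd,bkd->bqk', query, value)
    # Softmax over the encoder-step axis (numerically stabilized).
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    # Each decoder step gets a weighted average of encoder states.
    return np.einsum('bqk,bkd->bqd', weights, value), weights

rng = np.random.default_rng(0)
q = rng.normal(size=(2, 13, 100))  # decoder states: (batch, T_dec, D)
v = rng.normal(size=(2, 6, 100))   # encoder states: (batch, T_enc, D)
context, weights = dot_product_attention(q, v)
print(context.shape)  # (2, 13, 100): one context vector per decoder step
```

This is exactly the (B, T_dec, D) tensor that gets concatenated with the decoder LSTM output below.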
def create_translation_model_with_att(encode_vocab_size, decode_vocab_size, embed_dim):
    """
    Seq2Seq (Encoder-Decoder) with Luong-style dot-product attention via the Keras Attention layer.
    Teacher forcing during training: decoder sees <start>...<end>, predicts next tokens.

    Args:
      encode_vocab_size: source vocab size (incl. specials)
      decode_vocab_size: target vocab size (incl. specials)
      embed_dim: embedding size and LSTM hidden size

    Returns:
      Compiled tf.keras.Model:
        inputs  -> [source_ids (B, T_enc), target_ids (B, T_dec)]
        outputs -> token distributions over target vocab (B, T_dec, V_tgt)
    """
    # ----- Inputs -----
    # Source (encoder) ids: (batch, max_encoder_seq_length)
    source_input_with_att = tf.keras.layers.Input(
        shape=(max_encoder_seq_length,), dtype='int64', name='source_input_with_att'
    )
    # Target (decoder) ids: (batch, max_decoder_seq_length)
    target_input_with_att = tf.keras.layers.Input(
        shape=(max_decoder_seq_length,), dtype='int64', name='target_input_with_att'
    )

    # ----- Embeddings (IDs -> dense vectors) -----
    # TIP: consider mask_zero=True to automatically ignore PADs
    source_embedding_layer_with_att = tf.keras.layers.Embedding(
        input_dim=encode_vocab_size, output_dim=embed_dim, name='source_embedding_layer_with_att'
        # , mask_zero=True
    )
    target_embedding_layer_with_att = tf.keras.layers.Embedding(
        input_dim=decode_vocab_size, output_dim=embed_dim, name='target_embedding_layer_with_att'
        # , mask_zero=True
    )

    # Embedded sequences:
    # source_embeddings: (B, T_enc, D)
    source_embeddings_with_att = source_embedding_layer_with_att(source_input_with_att)
    # target_embeddings: (B, T_dec, D)
    target_embeddings_with_att = target_embedding_layer_with_att(target_input_with_att)

    # ----- Encoder LSTM -----
    # return_sequences=True: keep hidden states at every encoder step for attention keys/values
    # return_state=True    : also return final (h, c) to init the decoder
    encoder_lstm_layer_with_att = tf.keras.layers.LSTM(
        embed_dim, return_sequences=True, return_state=True, name='encoder_lstm_layer_with_att'
    )
    # encoder_out: (B, T_enc, D), encoder_state_h: (B, D), encoder_state_c: (B, D)
    encoder_out_with_att, encoder_state_h_with_att, encoder_state_c_with_att = \
        encoder_lstm_layer_with_att(source_embeddings_with_att)

    # ----- Decoder LSTM (teacher forcing) -----
    # Initialized with encoder final states so decoding is conditioned on the source
    decoder_lstm_layer_with_att = tf.keras.layers.LSTM(
        embed_dim, return_sequences=True, return_state=False, name='decoder_lstm_layer_with_att'
    )
    # decoder_lstm_out: (B, T_dec, D) -> these are the "queries" for attention
    decoder_lstm_out_with_att = decoder_lstm_layer_with_att(
        target_embeddings_with_att, initial_state=[encoder_state_h_with_att, encoder_state_c_with_att]
    )

    # ----- Attention -----
    # Keras Attention: Attention([query, value]) where key=value by default.
    #   query  : decoder time steps (B, T_dec, D)
    #   value  : encoder time steps (B, T_enc, D)
    #   output : context vectors aligned to each decoder step (B, T_dec, D)
    attention_context_vectors = tf.keras.layers.Attention(name='attention_layer')(
        [decoder_lstm_out_with_att, encoder_out_with_att]
    )

    # Concatenate decoder hidden state with its attention context per step
    # concat: (B, T_dec, 2D)
    concat_decode_out_with_att = tf.keras.layers.Concatenate(axis=-1, name='concat_layer_with_att')(
        [decoder_lstm_out_with_att, attention_context_vectors]
    )

    # ----- Time-distributed classification over target vocab -----
    # Dense applies to last dim -> (B, T_dec, V_tgt)
    target_classification_with_att = tf.keras.layers.Dense(
        decode_vocab_size, activation='softmax', name='classification_with_att'
    )(concat_decode_out_with_att)

    # ----- Assemble & compile -----
    translation_model_with_att = tf.keras.models.Model(
        inputs=[source_input_with_att, target_input_with_att],
        outputs=[target_classification_with_att]
    )
    translation_model_with_att.compile(
        optimizer="Adam",
        loss='sparse_categorical_crossentropy',  # labels are integer IDs shaped (B, T_dec)
        metrics=['accuracy']
    )
    return translation_model_with_att
translation_model_with_att = create_translation_model_with_att(encode_vocab_size, decode_vocab_size, embed_dim)
translation_model_with_att.summary()
Model: "model_1"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
source_input_with_att (Inp [(None, 6)] 0 []
utLayer)
target_input_with_att (Inp [(None, 13)] 0 []
utLayer)
source_embedding_layer_wit (None, 6, 100) 200200 ['source_input_with_att[0][0]'
h_att (Embedding) ]
target_embedding_layer_wit (None, 13, 100) 300400 ['target_input_with_att[0][0]'
h_att (Embedding) ]
encoder_lstm_layer_with_at [(None, 6, 100), 80400 ['source_embedding_layer_with_
t (LSTM) (None, 100), att[0][0]']
(None, 100)]
decoder_lstm_layer_with_at (None, 13, 100) 80400 ['target_embedding_layer_with_
t (LSTM) att[0][0]',
'encoder_lstm_layer_with_att[
0][1]',
'encoder_lstm_layer_with_att[
0][2]']
attention_layer (Attention (None, 13, 100) 0 ['decoder_lstm_layer_with_att[
) 0][0]',
'encoder_lstm_layer_with_att[
0][0]']
concat_layer_with_att (Con (None, 13, 200) 0 ['decoder_lstm_layer_with_att[
catenate) 0][0]',
'attention_layer[0][0]']
classification_with_att (D (None, 13, 3004) 603804 ['concat_layer_with_att[0][0]'
ense) ]
==================================================================================================
Total params: 1265204 (4.83 MB)
Trainable params: 1265204 (4.83 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________
translation_model_with_att.fit(x=[train_source_input_data, train_target_input_data],
y=train_target_output_data,
validation_data=([test_source_input_data, test_target_input_data],
test_target_output_data),
epochs=40)
Epoch 1/40  251/251 - 13s 37ms/step - loss: 2.3629 - accuracy: 0.6879 - val_loss: 1.5780 - val_accuracy: 0.7653
Epoch 2/40  251/251 - 3s 12ms/step - loss: 1.4624 - accuracy: 0.7728 - val_loss: 1.4306 - val_accuracy: 0.7776
Epoch 3/40  251/251 - 3s 10ms/step - loss: 1.3170 - accuracy: 0.7886 - val_loss: 1.3032 - val_accuracy: 0.7973
Epoch 4/40  251/251 - 2s 10ms/step - loss: 1.1933 - accuracy: 0.8034 - val_loss: 1.2235 - val_accuracy: 0.8071
Epoch 5/40  251/251 - 2s 9ms/step - loss: 1.1051 - accuracy: 0.8124 - val_loss: 1.1735 - val_accuracy: 0.8140
Epoch 6/40  251/251 - 2s 10ms/step - loss: 1.0312 - accuracy: 0.8204 - val_loss: 1.1252 - val_accuracy: 0.8212
Epoch 7/40  251/251 - 2s 10ms/step - loss: 0.9642 - accuracy: 0.8287 - val_loss: 1.0912 - val_accuracy: 0.8252
Epoch 8/40  251/251 - 2s 8ms/step - loss: 0.9046 - accuracy: 0.8357 - val_loss: 1.0564 - val_accuracy: 0.8307
Epoch 9/40  251/251 - 2s 8ms/step - loss: 0.8474 - accuracy: 0.8424 - val_loss: 1.0314 - val_accuracy: 0.8365
Epoch 10/40  251/251 - 2s 8ms/step - loss: 0.7970 - accuracy: 0.8477 - val_loss: 1.0101 - val_accuracy: 0.8399
Epoch 11/40  251/251 - 2s 8ms/step - loss: 0.7493 - accuracy: 0.8526 - val_loss: 0.9982 - val_accuracy: 0.8418
Epoch 12/40  251/251 - 2s 8ms/step - loss: 0.7051 - accuracy: 0.8577 - val_loss: 0.9796 - val_accuracy: 0.8448
Epoch 13/40  251/251 - 2s 7ms/step - loss: 0.6635 - accuracy: 0.8633 - val_loss: 0.9664 - val_accuracy: 0.8468
Epoch 14/40  251/251 - 2s 7ms/step - loss: 0.6232 - accuracy: 0.8688 - val_loss: 0.9537 - val_accuracy: 0.8500
Epoch 15/40  251/251 - 2s 8ms/step - loss: 0.5841 - accuracy: 0.8738 - val_loss: 0.9464 - val_accuracy: 0.8502
Epoch 16/40  251/251 - 2s 7ms/step - loss: 0.5466 - accuracy: 0.8796 - val_loss: 0.9277 - val_accuracy: 0.8548
Epoch 17/40  251/251 - 2s 8ms/step - loss: 0.5111 - accuracy: 0.8854 - val_loss: 0.9192 - val_accuracy: 0.8543
Epoch 18/40  251/251 - 2s 7ms/step - loss: 0.4768 - accuracy: 0.8913 - val_loss: 0.9130 - val_accuracy: 0.8561
Epoch 19/40  251/251 - 2s 7ms/step - loss: 0.4450 - accuracy: 0.8965 - val_loss: 0.9054 - val_accuracy: 0.8575
Epoch 20/40  251/251 - 2s 7ms/step - loss: 0.4146 - accuracy: 0.9027 - val_loss: 0.8981 - val_accuracy: 0.8620
Epoch 21/40  251/251 - 2s 8ms/step - loss: 0.3859 - accuracy: 0.9083 - val_loss: 0.8907 - val_accuracy: 0.8620
Epoch 22/40  251/251 - 2s 7ms/step - loss: 0.3598 - accuracy: 0.9137 - val_loss: 0.8876 - val_accuracy: 0.8641
Epoch 23/40  251/251 - 2s 8ms/step - loss: 0.3343 - accuracy: 0.9188 - val_loss: 0.8877 - val_accuracy: 0.8648
Epoch 24/40  251/251 - 2s 7ms/step - loss: 0.3110 - accuracy: 0.9236 - val_loss: 0.8855 - val_accuracy: 0.8657
Epoch 25/40  251/251 - 2s 7ms/step - loss: 0.2897 - accuracy: 0.9283 - val_loss: 0.8884 - val_accuracy: 0.8672
Epoch 26/40  251/251 - 2s 7ms/step - loss: 0.2702 - accuracy: 0.9323 - val_loss: 0.8877 - val_accuracy: 0.8670
Epoch 27/40  251/251 - 2s 8ms/step - loss: 0.2524 - accuracy: 0.9363 - val_loss: 0.8875 - val_accuracy: 0.8681
Epoch 28/40  251/251 - 2s 7ms/step - loss: 0.2355 - accuracy: 0.9408 - val_loss: 0.8864 - val_accuracy: 0.8694
Epoch 29/40  251/251 - 2s 8ms/step - loss: 0.2210 - accuracy: 0.9429 - val_loss: 0.8862 - val_accuracy: 0.8681
Epoch 30/40  251/251 - 2s 7ms/step - loss: 0.2072 - accuracy: 0.9464 - val_loss: 0.8894 - val_accuracy: 0.8693
Epoch 31/40  251/251 - 2s 7ms/step - loss: 0.1947 - accuracy: 0.9493 - val_loss: 0.8878 - val_accuracy: 0.8711
Epoch 32/40  251/251 - 2s 8ms/step - loss: 0.1815 - accuracy: 0.9514 - val_loss: 0.8928 - val_accuracy: 0.8690
Epoch 33/40  251/251 - 2s 7ms/step - loss: 0.1723 - accuracy: 0.9527 - val_loss: 0.8957 - val_accuracy: 0.8705
Epoch 34/40  251/251 - 2s 7ms/step - loss: 0.1630 - accuracy: 0.9553 - val_loss: 0.9052 - val_accuracy: 0.8704
Epoch 35/40  251/251 - 2s 7ms/step - loss: 0.1542 - accuracy: 0.9565 - val_loss: 0.9078 - val_accuracy: 0.8713
Epoch 36/40  251/251 - 2s 8ms/step - loss: 0.1465 - accuracy: 0.9582 - val_loss: 0.9077 - val_accuracy: 0.8719
Epoch 37/40  251/251 - 2s 7ms/step - loss: 0.1387 - accuracy: 0.9595 - val_loss: 0.9078 - val_accuracy: 0.8700
Epoch 38/40  251/251 - 2s 8ms/step - loss: 0.1318 - accuracy: 0.9607 - val_loss: 0.9232 - val_accuracy: 0.8712
Epoch 39/40  251/251 - 2s 7ms/step - loss: 0.1258 - accuracy: 0.9616 - val_loss: 0.9216 - val_accuracy: 0.8708
Epoch 40/40  251/251 - 2s 7ms/step - loss: 0.1207 - accuracy: 0.9629 - val_loss: 0.9266 - val_accuracy: 0.8711
<tf_keras.src.callbacks.History at 0x7a1e3aa4cd10>
Validation accuracy is about one percentage point better.
Question 1: Why do you think the benefit of adding an attention layer is not larger?
Short version: this setup doesn't give attention much room to shine. Here's why a roughly +1% bump is plausible:
- Sentences are short. The max lengths are ~6 (source) / ~13 (target). On short sequences an encoder LSTM already carries most of the information; the encoder "bottleneck" isn't severe, so attention adds little.
- Teacher forcing + token-accuracy metric. Validation "accuracy" here is per-token under teacher forcing, not sequence quality (BLEU/chrF). Attention mainly helps when the model must track long-range dependencies during generation; teacher forcing makes the task easier for both models, shrinking the gap.
- Padding is not masked. In this code, Embedding(..., mask_zero=True) isn't set and no masks are passed to Attention. If PAD positions aren't masked, the attention layer may attend to padding (noise), and PAD tokens dominate the loss/accuracy, diluting any benefit.
- Vocabulary/OOV pressure. The vocabularies are capped at 2k/3k and everything else maps to <unk>. When many tokens are <unk>, alignment cues get blurred, limiting what attention can exploit (especially for reordering, as in EN→DE).
- Simple attention & limited capacity. tf.keras.layers.Attention (dot-product, no projection) plus a single LSTM with units=embed_dim is modest capacity: no bidirectional encoder, no multi-head attention, no projection of [h_t ; context_t]. Capacity limits compress the gain.
- Optimization & variance. With few epochs, little data, and an untuned learning rate, the models may not separate much; ±1% can fall within run-to-run noise unless averaged across seeds.
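The padding issue discussed above can be made concrete with a minimal NumPy sketch (toy shapes, separate from the notebook's model): unmasked dot-product attention leaks weight onto PAD positions, while adding a large negative bias to the padded keys drives those weights to essentially zero.

```python
import numpy as np

def dot_product_attention(q, k, v, key_mask=None):
    """Toy single-example scaled dot-product attention.
    q: (T_dec, d); k, v: (T_enc, d); key_mask: (T_enc,) with 1=real token, 0=PAD."""
    scores = q @ k.T / np.sqrt(q.shape[-1])               # (T_dec, T_enc)
    if key_mask is not None:
        # Large negative bias on PAD keys -> softmax weight underflows to 0
        scores = np.where(key_mask[None, :] == 1, scores, -1e9)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)        # softmax over encoder steps
    return weights @ v, weights

rng = np.random.default_rng(0)
q = rng.normal(size=(2, 4))                # 2 decoder steps
k = rng.normal(size=(5, 4))                # 5 encoder steps; pretend the last 2 are PAD
v = rng.normal(size=(5, 4))
pad_mask = np.array([1, 1, 1, 0, 0])

_, w_unmasked = dot_product_attention(q, k, v)
_, w_masked = dot_product_attention(q, k, v, key_mask=pad_mask)
print("weight on PAD, unmasked:", w_unmasked[:, 3:].sum())  # nonzero: attends to padding
print("weight on PAD, masked:  ", w_masked[:, 3:].sum())    # ~0 after masking
```

In Keras, the analogous fix is setting mask_zero=True on the Embedding layers (or passing an explicit mask to the Attention layer) so PAD time steps are excluded from both attention and the loss.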
3. T5 (Text-to-Text Transfer Transformer)¶
What it is: A transformer that uses both an encoder and a decoder and treats every NLP task as text-in → text-out.
Paper: Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer (arXiv:1910.10683).
Key idea: Prefix the input with a task indicator (e.g., "summarize:", "translate English to German:", "question: ... context: ...") so the same model handles many tasks.
Tasks supported: Summarization, QA, translation, classification, and more.
- See Appendix D of the paper for the complete task set used for T5 base and large checkpoints and their training data.
What we’ll do next: Try out Hugging Face’s T5-Large to generate text from prompts using the task-prefix format.
!pip install -U -q transformers accelerate
!pip install -q torchinfo
#import torch
from torchinfo import summary
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
t5_tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-large")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-large")
Let's look at the makeup of this model.
summary(t5_model)
================================================================================ Layer (type:depth-idx) Param # ================================================================================ T5ForConditionalGeneration -- ├─Embedding: 1-1 32,899,072 ├─T5Stack: 1-2 32,899,072 │ └─Embedding: 2-1 (recursive) │ └─ModuleList: 2-2 -- │ │ └─T5Block: 3-1 12,585,472 │ │ └─T5Block: 3-2 12,584,960 │ │ └─T5Block: 3-3 12,584,960 │ │ └─T5Block: 3-4 12,584,960 │ │ └─T5Block: 3-5 12,584,960 │ │ └─T5Block: 3-6 12,584,960 │ │ └─T5Block: 3-7 12,584,960 │ │ └─T5Block: 3-8 12,584,960 │ │ └─T5Block: 3-9 12,584,960 │ │ └─T5Block: 3-10 12,584,960 │ │ └─T5Block: 3-11 12,584,960 │ │ └─T5Block: 3-12 12,584,960 │ │ └─T5Block: 3-13 12,584,960 │ │ └─T5Block: 3-14 12,584,960 │ │ └─T5Block: 3-15 12,584,960 │ │ └─T5Block: 3-16 12,584,960 │ │ └─T5Block: 3-17 12,584,960 │ │ └─T5Block: 3-18 12,584,960 │ │ └─T5Block: 3-19 12,584,960 │ │ └─T5Block: 3-20 12,584,960 │ │ └─T5Block: 3-21 12,584,960 │ │ └─T5Block: 3-22 12,584,960 │ │ └─T5Block: 3-23 12,584,960 │ │ └─T5Block: 3-24 12,584,960 │ └─T5LayerNorm: 2-3 1,024 │ └─Dropout: 2-4 -- ├─T5Stack: 1-3 32,899,072 │ └─Embedding: 2-5 (recursive) │ └─ModuleList: 2-6 -- │ │ └─T5Block: 3-25 16,780,800 │ │ └─T5Block: 3-26 16,780,288 │ │ └─T5Block: 3-27 16,780,288 │ │ └─T5Block: 3-28 16,780,288 │ │ └─T5Block: 3-29 16,780,288 │ │ └─T5Block: 3-30 16,780,288 │ │ └─T5Block: 3-31 16,780,288 │ │ └─T5Block: 3-32 16,780,288 │ │ └─T5Block: 3-33 16,780,288 │ │ └─T5Block: 3-34 16,780,288 │ │ └─T5Block: 3-35 16,780,288 │ │ └─T5Block: 3-36 16,780,288 │ │ └─T5Block: 3-37 16,780,288 │ │ └─T5Block: 3-38 16,780,288 │ │ └─T5Block: 3-39 16,780,288 │ │ └─T5Block: 3-40 16,780,288 │ │ └─T5Block: 3-41 16,780,288 │ │ └─T5Block: 3-42 16,780,288 │ │ └─T5Block: 3-43 16,780,288 │ │ └─T5Block: 3-44 16,780,288 │ │ └─T5Block: 3-45 16,780,288 │ │ └─T5Block: 3-46 16,780,288 │ │ └─T5Block: 3-47 16,780,288 │ │ └─T5Block: 3-48 16,780,288 │ └─T5LayerNorm: 2-7 1,024 │ └─Dropout: 2-8 -- ├─Linear: 1-4 
32,899,072 ================================================================================ Total params: 836,365,312 Trainable params: 836,365,312 Non-trainable params: 0 ================================================================================
836M trainable parameters. Quite a lot.
Let's create a short text to use as an example.
ARTICLE = ("Oh boy, what a lengthy and cumbersome excercise this was. " \
"I had to look into every detail, check everything twice, " \
" and then compare to prior results. Because of this tediousness " \
" and extra work my homework was 2 days late.")
Next, we need to specify the task we want T5 to perform by including it at the beginning of the input text. Because we are summarizing, we prepend the word summarize: to our input.
# Prefix the task so T5 knows what to do. T5 treats everything as "text → text".
# Common prefixes: "summarize:", "translate English to German:", "question:", etc.
t5_input_text = "summarize: " + ARTICLE
# Tokenize to model-ready tensors.
# - Wrapping in a list creates a batch of size 1.
# - return_tensors='pt' returns PyTorch tensors (use 'tf' for TensorFlow).
t5_inputs = t5_tokenizer([t5_input_text], return_tensors='pt')
First, we will generate a summary using the default output options.
# Generate summary token IDs from the input_ids.
# By default, .generate() will do *greedy decoding* (no sampling, no beam search)
# with the model’s built-in max_length. You can override behavior (see notes below).
t5_summary_ids = t5_model.generate(t5_inputs["input_ids"])
# Decode each generated sequence of IDs back to text.
# - skip_special_tokens=True removes tokens like <pad>, </s>
# - clean_up_tokenization_spaces=False preserves spacing exactly as produced
# (set True if you want tokenizer to normalize spaces/punctuation)
print([
t5_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for g in t5_summary_ids
])
['homework was a lengthy and cumbersome excercise . because of this tediousness']
Not great. But let's get more sophisticated and prescribe a minimum length and use beam search to generate multiple outputs. We also indicate the maximum length the output should be. Finally, in order to reduce repetitive output we tell the model to avoid output that repeats trigrams (three word groupings).
# Beam search generation with basic anti-repetition + length controls.
t5_summary_ids = t5_model.generate(
t5_inputs['input_ids'],
num_beams=3, # Use 3 beams: explore top-3 continuations at each step
no_repeat_ngram_size=3, # Forbid repeating any 3-word phrase (helps reduce loops)
min_length=20, # Do not stop before 20 tokens are generated
max_length=40 # Hard cap on total generated length (incl. special tokens)
)
# Decode IDs -> strings.
print([
t5_tokenizer.decode(
g,
skip_special_tokens=True, # drop <pad>, </s>, etc.
clean_up_tokenization_spaces=False # keep raw spacing exactly
)
for g in t5_summary_ids
])
['i had to look into every detail, check everything twice, and then compare to prior results . because of this tediousness and extra work my homework was 2 days late .']
That is a bit better, thanks to the decoding hyperparameters we applied.
Lastly, can T5 perform machine translation? Yes, in some limited instances. We need to specify the input and output languages. Keep in mind that the model has only been trained to translate in particular directions e.g. English to Romanian but NOT Romanian to English.
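Since the task (and translation direction) is selected purely by the text prefix, the pattern can be made explicit with a tiny helper. This is an illustrative sketch, not part of the transformers API; the dictionary keys are our own names:

```python
# Hypothetical helper: T5 selects its task purely via a text prefix.
# These prefixes match the ones T5 was pre-trained with; the direction matters
# (e.g. English->German is supported, German->English is not).
T5_PREFIXES = {
    "summarize": "summarize: ",
    "en_to_de": "translate English to German: ",
    "en_to_fr": "translate English to French: ",
    "en_to_ro": "translate English to Romanian: ",
}

def make_t5_prompt(task, text):
    """Prepend the task prefix so T5 knows what to do with the input."""
    return T5_PREFIXES[task] + text

print(make_t5_prompt("en_to_de", "Good morning."))
# translate English to German: Good morning.
```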
t5_input_text = "translate English to German: " + ARTICLE
t5_inputs = t5_tokenizer([t5_input_text], return_tensors='pt')
t5_summary_ids = t5_model.generate(t5_inputs['input_ids'],
num_beams=3,
no_repeat_ngram_size=3,
min_length=10,
max_length=40)
print([t5_tokenizer.decode(g, skip_special_tokens=True,
clean_up_tokenization_spaces=False) for g in t5_summary_ids])
['Ich habe es nicht geschafft, meinen ersten Test zu schreiben, da ich nicht genügend Zeit hatte, um meinen Test zu bearbeiten.']
Hmm... the output is fluent German. But paste the German output into translate.google.com and see what it actually means:
- Not great:
- ['I failed to take my first test because I didn't have enough time to complete my test.']
Is it anything like its English input? This hallucination might be mitigated by changing some of the hyperparameters, such as num_beams.
Is a shorter example more accurate? Maybe.
t5_input_text = "translate English to German: That was really not very good today; it was too difficult to solve."
t5_inputs = t5_tokenizer([t5_input_text], return_tensors='pt')
t5_summary_ids = t5_model.generate(t5_inputs['input_ids'],
num_beams=3,
no_repeat_ngram_size=3,
min_length=10,
max_length=40)
print([t5_tokenizer.decode(g, skip_special_tokens=True,
clean_up_tokenization_spaces=False) for g in t5_summary_ids])
['Das war heute wirklich nicht sehr gut; es war zu schwierig zu lösen.']
That is not bad, though it contains some small mistakes.
- Not bad:
- ['That really wasn't very good today; it was too difficult to solve.']
4. Prompt Engineering & Generative Large Language Models¶
Why it matters: Very large language models (e.g., GPT-3) enable zero-shot and few-shot learning: you provide a prompt (and optionally a few examples), and the model produces the answer—often without any task-specific fine-tuning.
Key idea: “Pre-train, prompt, and predict” (Liu et al.). A single pre-trained model can serve many users/tasks; success depends largely on how you phrase the prompt and what context you include.
Core Concepts¶
- Zero-shot: No exemplars—just instructions/context → model answers.
- Few-shot: Include a handful of input→output examples to steer behavior.
- Prompt design: Clear instructions, role/context, constraints, and format improve reliability.
Two Prompting Families¶
1) Cloze (Mask) Prompts¶
- What: Fill-in-the-blank style prompts that mirror masked language modeling.
- Who: Pre-trained encoder/encoder-decoder models like BERT and T5.
- How: Hide tokens/spans and ask the model to predict the masked content.
- Use cases: Classification via pattern-verbalizer pairs, slot filling, factual recall.
2) Prefix (Next-Token) Prompts¶
- What: Provide a prefix and ask the model to continue it via next-token prediction.
- Who: Decoder-only models (GPT family).
- How: “Instruction + context + (optional) few-shot examples →” model completes the sequence.
- Use cases: Generation (QA, code, writing), summarization, translation, reasoning.
Zero-Shot Instruction-Tuned Models¶
- Example: Llama 3.1 8B (instruction-tuned) treats tasks as text-in → text-out without fine-tuning.
- Benefit: Strong zero-/few-shot performance by following natural-language instructions.
- Practice: State the task, specify constraints (tone, length, format), and include evaluation criteria when possible.
Practical Implications¶
- No fine-tuning required: Broadly shareable models; focus on prompt crafting.
- Try structured prompts: Role, goals, constraints, examples, and desired output schema.
- Iterate & test: Small prompt edits often yield large quality gains.
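The "structured prompt" suggestion above can be sketched as a plain string builder. The field names here are illustrative choices, not a library API:

```python
# Illustrative structured-prompt builder: role, goal, constraints, output format.
def build_prompt(role, goal, constraints, output_format):
    """Assemble a structured prompt from its components."""
    lines = [
        f"You are {role}.",
        f"Task: {goal}",
        "Constraints:",
        *[f"- {c}" for c in constraints],
        f"Respond in this format: {output_format}",
    ]
    return "\n".join(lines)

prompt = build_prompt(
    role="a concise technical editor",
    goal="summarize the article in two sentences",
    constraints=["neutral tone", "no more than 40 words"],
    output_format="plain text, one paragraph",
)
print(prompt)
```

Small edits to any of these fields (tone, length, format) are exactly the kind of iteration that often yields large quality gains.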
4.1 Cloze Prompts¶
Cloze prompts take advantage of the masked language model task where an individual word or span of words anywhere in the input are masked and the language model learns to predict them.
#Delete the old model so we are managing memory
del t5_tokenizer
del t5_model
# Get a new model with a new checkpoint
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
t5_tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
T5 Sentinel Tokens (Masked Word Modeling)¶
What is <extra_id_0>? A sentinel token used by T5 to trigger its span-masking (fill-in-the-blank) capability.
- T5 provides 100 such tokens: <extra_id_0>, <extra_id_1>, …, <extra_id_99>.
Why it matters: You can craft cloze-test prompts to probe the model's pretrained knowledge; T5 will try to fill in the masked span.
Example behavior: Try a prompt where the missing word is a dog breed; T5 often handles "poodle" correctly, but substituting "beagle" can confuse it, revealing what the model learned during pretraining.
Example Prompt Pattern¶
"The <extra_id_0> is a small, intelligent dog breed known for its curly coat."
The model is expected to fill <extra_id_0> with something like "poodle".
Decoding Strategy (Beam Search)¶
Beam search explores multiple likely continuations at each step.
Here, we return the top 3 candidates instead of just the single best output to see alternative fills.
- This helps diagnose ambiguity and understand the model’s confidence spread.
Tip: Combine beam search with constraints (e.g., no_repeat_ngram_size) if you see repetitive outputs.
# Cloze-style prompt: T5 will try to fill the masked span <extra_id_0>.
PROMPT_SENTENCE = ("An Australian <extra_id_0> is a type of working dog .")
# Prepare a batch of size 1 for the model (PyTorch tensors).
t5_input_text = PROMPT_SENTENCE
t5_inputs = t5_tokenizer([t5_input_text], return_tensors='pt')
# Generate candidates with beam search.
t5_summary_ids = t5_model.generate(
t5_inputs['input_ids'],
num_beams=10, # explore top-10 continuations at each step
# temperature=0.8, # only used if do_sample=True (sampling); ignored with pure beams
no_repeat_ngram_size=2, # avoid repeating any 2-gram (helps reduce loops)
num_return_sequences=3, # return the top-3 finished beams (≤ num_beams)
min_length=1, # ensure at least 1 token is generated
max_length=3 # cap total generated tokens very short (good for a single word)
)
# Decode each candidate. Skipping special tokens removes <extra_id_x> markers in outputs.
print([
t5_tokenizer.decode(
g,
skip_special_tokens=True, # drop <pad>, </s>, <extra_id_x>, etc.
clean_up_tokenization_spaces=False
)
for g in t5_summary_ids
])
['Shepherd', 'working', 'Working']
#Keep our memory free of old models
del t5_tokenizer
del t5_model
4.2 Prefix Prompts¶
What are they? Prefix prompts are inputs for next-token prediction models (decoder-only transformers) where you provide a single sequence that contains instructions (and optionally examples). The model then continues that sequence.
Why they work: Decoder-only models (e.g., GPT-3) learn to predict the next word given a long context window. If your prefix contains the right instructions, constraints, and examples, the continuation often does what you want.
How to Use Prefix Prompts¶
- Zero-shot: Just ask the question with clear instructions.
- Few-shot: Provide a couple of input→output examples, then your new input.
- Good practice: Specify format, style, and constraints (length, tone, JSON schema, etc.).
Example Model: OPT (Decoder-Only)¶
- Paper: OPT
- What it is: A family of GPT-3-like models (125M → 175B parameters).
- Key property: Single sequence input (the prefix); there’s no separate “encoder input” and “decoder target.”
- What we’ll use: The 350M parameter checkpoint for quick experimentation.
Workflow:
- Build a prefix: instructions + (optional) examples + your query.
- Feed to the model as one sequence.
- Generate a continuation (control length, temperature/top-p, repetition penalties).
- Parse/validate the output (especially if you asked for structured formats).
Tip: Few-shot examples that closely match your desired task improve reliability and reduce prompt sensitivity.
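The zero-/few-shot workflow above ultimately amounts to string concatenation. A minimal sketch (the delimiters and the Input/Output example format are our own choices, not an OPT requirement):

```python
# Assemble a few-shot prefix prompt: instruction + exemplars + the new query.
def few_shot_prompt(instruction, examples, query):
    """Build a single prefix string for a decoder-only model to continue."""
    parts = [instruction, ""]
    for inp, out in examples:
        parts.append(f"Input: {inp}")
        parts.append(f"Output: {out}")
        parts.append("")
    parts.append(f"Input: {query}")
    parts.append("Output:")          # the model continues from here
    return "\n".join(parts)

prompt = few_shot_prompt(
    "Classify the sentiment as positive or negative.",
    [("I loved this movie!", "positive"),
     ("Terrible service, never again.", "negative")],
    "The cookies were delicious.",
)
print(prompt)
```

Ending the prefix with "Output:" gives the model an explicit delimiter, so its continuation is (hopefully) just the label.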
As with T5, we'll just try out the pre-trained model and see what text it generates for a new starting sequence.
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
We'll give it a prompt now and see what it generates next.
prefix_prompt = 'Yesterday, I went to the store to buy '
input_ids = tokenizer.encode(prefix_prompt, return_tensors='pt')
Parameter cheat sheet¶
- do_sample=True → enables stochastic decoding (uses temperature, top_k, top_p).
- top_k & top_p together → apply both filters: logits are limited to the top-k, then truncated by nucleus sampling.
- temperature → scales randomness. Try 0.7–1.0 for balanced quality/diversity.
- repetition_penalty → discourages repeats; 1.1–1.8 is common. Too high can harm fluency.
- num_return_sequences → must be ≤ batch_size × (num_beams, or 1 for sampling). With sampling, you'll get diverse outputs.
- max_length → legacy cap on the TOTAL length (prompt + generation) for some models. Prefer max_new_tokens=35, which limits only the generated tokens.
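To see concretely how temperature, top_k, and top_p shape the next-token distribution, here is a toy NumPy re-implementation of the filtering logic over a hand-made distribution. This is a sketch of the idea only, not Hugging Face's actual implementation:

```python
import numpy as np

def filter_and_sample(logits, temperature=1.0, top_k=None, top_p=None, rng=None):
    """Toy decoder step: temperature-scale, apply top-k then nucleus (top-p), sample."""
    logits = np.asarray(logits, dtype=float) / temperature
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    order = np.argsort(probs)[::-1]               # most to least likely
    keep = np.ones_like(probs, dtype=bool)
    if top_k is not None:
        keep[order[top_k:]] = False               # drop everything past the top-k
    if top_p is not None:
        cum = np.cumsum(probs[order])
        cutoff = np.searchsorted(cum, top_p) + 1  # smallest set with cum prob >= top_p
        keep[order[cutoff:]] = False
    probs = np.where(keep, probs, 0.0)
    probs /= probs.sum()                          # renormalize over survivors
    rng = rng or np.random.default_rng(0)
    return rng.choice(len(probs), p=probs), probs

token, probs = filter_and_sample([4.0, 3.0, 1.0, 0.5],
                                 temperature=0.8, top_k=3, top_p=0.9)
print(token, np.count_nonzero(probs))  # the low-probability tail has been zeroed out
```

Lower temperature sharpens the distribution before filtering, so top_p then keeps an even smaller nucleus; this is why the two parameters interact.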
Practical tips¶
- Match devices: model.to(device); input_ids = input_ids.to(device) to avoid CPU/GPU mismatches.
- Control stopping: if your tokenizer defines eos_token_id, the model will stop on EOS; otherwise consider stopping_criteria.
- Reproducibility: set a seed for deterministic sampling: import torch; torch.manual_seed(42)
- Prompt matters: decoder-only models continue the prefix. Include clear instructions/examples and, if needed, an explicit delimiter before where generation should start.
# Stochastic text generation from a decoder-only LM (e.g., OPT/GPT-like)
generated_text_outputs = model.generate(
input_ids, # tokenized prefix prompt (batch or single sequence)
max_length=35, # TOTAL length cap (prefix+new tokens) for some models
# (Prefer max_new_tokens=... in recent HF)
num_return_sequences=3, # return 3 different samples (requires do_sample=True)
repetition_penalty=1.5, # >1.0 penalizes repeated tokens/phrases (helps reduce loops)
top_p=0.92, # nucleus sampling: keep smallest set with cum prob ≥ 0.92
temperature=0.95, # soften/sharpen logits (<1.0 = conservative, >1.0 = creative)
do_sample=True, # enable sampling (vs default greedy/beam search)
top_k=125, # truncate to top-125 tokens before sampling (used with top_p)
early_stopping=True # mainly relevant for beam search; with sampling it has little effect
)
# Print each generated continuation
for i, seq in enumerate(generated_text_outputs):
print()
print(f"{i}: {tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)}")
0: Yesterday, I went to the store to buy a new bike. I was really excited because it didn't have anything else listed on Amazon but instead there were two bikes 1: Yesterday, I went to the store to buy ~~a new~~ used book. As a note-to kindle fanboy that has always liked using text on my 2: Yesterday, I went to the store to buy - 1 pair of jeans (the length and width are different) - a pair of chinos from my normal shopping list
Now let's try a long prompt to give the model a lot of context and see how well it performs. We'll ask it to generate a recipe and see how well it follows instructions. We'll try it with several smaller models available on Hugging Face. Finally, we'll include the output for the same prompt from ChatGPT for comparison purposes.
In order to do so without exceeding the memory, you should STOP the notebook and Reconnect it.
!pip install pydot --quiet
!pip install transformers --quiet
!pip install sentencepiece --quiet
We'll also use the HuggingFace AutoTokenizer and AutoModelXXX classes as they'll let us just specify new checkpoints rather than having to find the specific model type.
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers import AutoTokenizer
from pprint import pprint
Now let's try the Facebook OPT model, which is designed to be an open-source equivalent of GPT-3. We have limited compute resources, so we'll use the 1.3-billion-parameter model. For comparison, the full GPT-3 model has 175 billion parameters.
checkpoint_string = "facebook/opt-1.3b"
from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(checkpoint_string)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_string)
/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: The secret `HF_TOKEN` does not exist in your Colab secrets. To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session. You will be able to reuse this secret in all of your notebooks. Please note that authentication is recommended but still optional to access public models or datasets. warnings.warn(
inputs = tokenizer("You are a world renowned James Beard award winning pastry chef. Give us the recipe for your specialty, chocolate chip cookies. Only give us the ingredients and instructions.", return_tensors="pt")
outputs = model.generate(**inputs,
do_sample=True, min_length=100, max_length=300, temperature=0.97, repetition_penalty=1.2
)
outputs.shape
torch.Size([1, 113])
pprint(tokenizer.batch_decode(outputs, skip_special_tokens=True),compact=True)
['You are a world renowned James Beard award winning pastry chef. Give us the ' 'recipe for your specialty, chocolate chip cookies. Only give us the ' 'ingredients and instructions.\n' 'I don’t know why I never thought of this before! We just started going to a ' 'lot of weddings as a team so we can try out ideas like these that seem ' 'cool/worth working on! I love how versatile cake frosting is, it has such ' 'great properties on its own too! Thanks for the suggestions! Will update ' 'when I get some good baking stuff together :)']
Now let's try a different model -- a T5 model that is fine-tuned on the FLAN instruction data. We would expect a better result because it has been fine-tuned to follow instructions.
del tokenizer
del model
checkpoint_string = "google/flan-t5-large"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_string)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_string)
inputs = tokenizer("You are a world renowned James Beard award winning pastry chef. Give us the recipe for your specialty, chocolate chip cookies. Only give us the ingredients and instructions.", return_tensors="pt")
outputs = model.generate(**inputs,
do_sample=True, min_length=100, max_length=300, temperature=0.97, repetition_penalty=1.2
)
outputs.shape
torch.Size([1, 114])
Now let's print out the results. Note that we're using a PyTorch version of T5, so our output is a torch.Tensor rather than a TensorFlow tensor. Look at the ingredients and decide how good this recipe would be.
pprint(tokenizer.batch_decode(outputs, skip_special_tokens=True),compact=True)
['the entire world of chocolate chip cookies is yours. all the ingredients ' 'listed for the recipe are included. just give us the recipe in text form. we ' 'will make you a new batch of cookies. all you need to do is add the ' 'remaining ingredients. we will add them to your cookies while we wait for ' 'the cookies. and then when they are ready, you will put them on the cookie ' 'tray. enjoy and please take pictures if you want to share your cookies and ' 'recipe. our contest ends on 7 December at 10:00 PM (GMT +3)']
Let's try a different model -- one that is designed to run with a significantly smaller number of parameters and has also been fine-tuned on a large instruction dataset. The Alpaca model was released in 2023 for research purposes.
del tokenizer
del model
checkpoint_string = "declare-lab/flan-alpaca-large"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_string)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_string)
inputs = tokenizer("You are a world renowned James Beard award winning pastry chef. Give us the recipe for your specialty, chocolate chip cookies. Only give us the ingredients and instructions.", return_tensors="pt")
outputs = model.generate(**inputs,
do_sample=True, min_length=100, max_length=300, temperature=0.97, repetition_penalty=1.2
)
outputs.shape
torch.Size([1, 119])
pprint(tokenizer.batch_decode(outputs, skip_special_tokens=True),compact=True) # This model is fine-tuned to follow the instructions - not bad!
['Ingredients: - 2 large eggs Beat 3 cups milk, 1 teaspoon vanilla extract - ' '1/4 cup sugar - 3/4 cup butter Instructions: 1. Heat the milk in a large ' 'saucepan over medium heat. 2. Add the egg and stir to mix until combined. 3. ' 'Stir in the flour, baking soda and salt. 4. Slowly add the wet ingredients ' 'to the dry and beat together. 5. Drop spoonfuls of dough onto pre-greased ' 'cookie sheets. 6. Bake for 8-12 minutes or until edges are golden brown. 7. ' 'Allow cookies to cool before serving.']
4.3 Instruction-Tuned Reasoning Prompts — Qwen 3 (4B vs 14B)¶
What is Qwen 3?
Qwen 3 is Alibaba Cloud’s latest reasoning LLM (released April 2025). By default it “thinks” before it “answers”: the internal reasoning is delimited by explicit <think></think> tags, and the final reply follows after the closing tag. See the model card for details. It’s open-source under Apache 2.0.
Why it’s notable¶
- Reasoning-first decoding: Encourages structured intermediate steps, improving complex task performance.
- Instruction-tuned: Follows natural-language instructions out of the box.
- Open licensing (Apache 2.0): Suitable for research and many production use cases.
Model sizes & practical trade-offs¶
Qwen 3–4B (quantized)¶
- Footprint: ~8 GB of weights in half precision; roughly 2–3 GB with 4-bit quantization.
- Hardware fit: Runs on a T4 GPU; downloads/loads relatively quickly.
- Use case: Great for demos, prototyping, and cost-sensitive deployments.
Qwen 3–14B¶
- Footprint: 30+ GB of weights.
- Startup time: ~8 minutes to download and fully load (varies by network/storage).
- Use case: Better accuracy/reasoning depth than 4B; needs beefier hardware (more VRAM/RAM).
Tips for running within memory limits¶
Prefer the 4B quantized checkpoint on smaller GPUs (e.g., T4).
If switching from 14B → 4B (or vice versa) in the same notebook:
- Stop the notebook/runtime.
- Disconnect & Delete the runtime (clears GPU memory).
- Reconnect and re-load the chosen model.
Consider:
- Quantization (4-bit/8-bit) to reduce memory.
- CPU offloading / paged attention if supported by your runtime.
- Lower max sequence length to curb memory at inference time.
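The memory trade-offs above can be sanity-checked with back-of-the-envelope arithmetic. This is a rough sketch (the function name is ours, not a library API); real loads add overhead for activations, the KV cache, and quantization metadata:

```python
def approx_weight_gb(n_params_billions: float, bits_per_param: float) -> float:
    """Rough weight-memory estimate: parameter count times bits per parameter, in GB."""
    n_bytes = n_params_billions * 1e9 * bits_per_param / 8
    return n_bytes / 1e9

# Qwen3-4B: fp16 vs. 4-bit (nf4) quantization
print(approx_weight_gb(4, 16))   # ~8 GB  -> tight on a 15 GB T4 once activations are added
print(approx_weight_gb(4, 4))    # ~2 GB  -> fits comfortably on a T4
# Qwen3-14B: half precision alone needs ~28 GB, so it won't fit on a T4 unquantized
print(approx_weight_gb(14, 16))  # ~28 GB
print(approx_weight_gb(14, 4))   # ~7 GB
```

This is why the 4B checkpoint (quantized) is the practical choice on a T4, while the 14B model needs either a larger GPU or aggressive quantization.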
Quick setup checklist¶
- Pick model size: 4B (quantized) for T4; 14B for larger GPUs.
- Ensure environment has compatible transformer libs and backend.
- Set generation limits (e.g., max_new_tokens) to keep memory in check.
- Log/inspect outputs to observe <think> reasoning vs. final answers for debugging and evaluation.
!pip install -q -U transformers #>=4.43.0
!pip install -q einops
!pip install -q -U accelerate #>=0.31.0
!pip install -q -U bitsandbytes
!pip install -q -U flash_attn
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.4/41.4 kB 3.2 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.0/12.0 MB 122.7 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.1/60.1 MB 45.1 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 137.9 MB/s eta 0:00:00 Preparing metadata (setup.py) ... done Building wheel for flash_attn (setup.py) ... done
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
from pprint import pprint
#In case we want to know our installed transformers library version
!pip list | grep transformers
!pip list | grep accelerate
!pip list | grep flash_attn
sentence-transformers 5.1.1 transformers 4.57.0 accelerate 1.10.1 flash_attn 2.8.3
#Quantization shrinks the memory footprint of the LLM
# allowing us to load it on a smaller GPU
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.float16,
)
model_id = "Qwen/Qwen3-4B" #Try first, downloads and loads faster
#model_id = "Qwen/Qwen3-14B" #You can try this much larger version and compare its performance. More parameters generally means better quality, at a higher memory cost
pipeline = pipeline( # note: this rebinds the name `pipeline` from the factory function to a Pipeline object
"text-generation",
model=model_id,
model_kwargs={"dtype": torch.bfloat16, "quantization_config": quantization_config},
device_map="auto",
)
messages = [
{"role": "system", "content": "You are a science communicator who makes technology accessible to everyone!"},
{"role": "user", "content": "Please write a five sentence explanation of how LLMs do knowledge representation."},
]
outputs = pipeline(
messages,
max_new_tokens=512,
)
pprint(outputs[0]["generated_text"][-1], compact=True)
Device set to use cuda:0
{'content': '<think>\n'
'Okay, the user wants a five-sentence explanation of how LLMs do '
'knowledge representation. Let me start by recalling what I know '
'about LLMs and knowledge representation.\n'
'\n'
'First, I need to explain that LLMs use large-scale data to learn '
'patterns. Then, mention how they represent knowledge through '
'training on diverse texts. Maybe talk about embeddings as a key '
'part. Also, include how they use context to understand '
'relationships. Finally, tie it to the ability to answer questions '
'and generate text based on that representation.\n'
'\n'
'Wait, should I mention specific techniques like transformers or '
'attention mechanisms? The user might not need the technical '
'jargon, so maybe keep it high-level. Also, ensure each sentence '
'is a separate point. Let me check if I have five clear points. \n'
'\n'
'1. LLMs learn from vast data. \n'
'2. They use embeddings to represent concepts. \n'
'3. Context helps in understanding relationships. \n'
'4. They can answer questions using this. \n'
'5. The representation allows for generating coherent text. \n'
'\n'
'That should cover it. Make sure each sentence is concise and '
'flows logically. Avoid technical terms where possible. Maybe use '
'examples like sentences or questions to make it relatable. '
'Alright, let me put that together.\n'
'</think>\n'
'\n'
'Large Language Models (LLMs) represent knowledge by learning '
'patterns from vast amounts of text data during training. They '
'encode information into dense numerical vectors (embeddings) that '
'capture semantic relationships between words, phrases, and '
'concepts. These embeddings are generated through self-attention '
'mechanisms, allowing the model to contextualize meaning based on '
'surrounding words. By analyzing patterns in training data, LLMs '
'implicitly "understand" relationships between ideas, enabling '
'them to answer questions or generate text that reflects this '
'knowledge. This representation allows LLMs to reason about the '
'world, though their understanding is based on statistical '
'patterns rather than explicit logic.',
'role': 'assistant'}
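The generated content above interleaves the <think>…</think> reasoning with the final answer. A small helper (a sketch of our own, assuming the tags appear in exactly this form) can split the two for logging or evaluation:

```python
import re

def split_think(text: str) -> tuple:
    """Separate the <think>...</think> reasoning block from the final answer."""
    match = re.search(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if match is None:
        return "", text.strip()          # no reasoning block present
    reasoning = match.group(1).strip()
    answer = text[match.end():].strip()  # everything after </think> is the reply
    return reasoning, answer

reasoning, answer = split_think(
    "<think>\nLet me plan the reply.\n</think>\n\nLLMs encode knowledge in their weights."
)
print(answer)  # -> LLMs encode knowledge in their weights.
```

Applied to the pipeline output (`outputs[0]["generated_text"][-1]["content"]`), this lets you inspect the reasoning and the answer independently.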
Let's run some of the same prompts from earlier to see how well this model performs. Note that it takes much longer to generate answers because this model "thinks" before it generates its final answer.
How well do the outputs from Qwen 3 compare with the outputs from the earlier models? How can we measure their performance? How can we compare the two models quantitatively?
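One simple quantitative starting point (a sketch, not a standard benchmark) is token-overlap F1 between each model's answer and a human-written reference answer, the same statistic used in SQuAD-style evaluation:

```python
from collections import Counter

def token_f1(prediction: str, reference: str) -> float:
    """F1 over whitespace tokens: harmonic mean of precision and recall of shared tokens."""
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    common = Counter(pred_tokens) & Counter(ref_tokens)  # multiset intersection
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)

# Score each model's answer against the same reference, then compare across models.
print(token_f1("x equals 2", "x equals 2"))       # 1.0
print(token_f1("the answer is 2", "x equals 2"))  # partial overlap, between 0 and 1
```

More rigorous options include BLEU or ROUGE (via nltk, which is already installed), or an LLM-as-judge setup, but token F1 is enough to start making the comparison concrete.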
messages = [
{"role": "user", "content": "What are the steps required for solving an 2x + 3 = 7 equation?"},
]
prompt = pipeline.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
# Let's set some sampling values to have more control over the output
outputs = pipeline(
prompt,
max_new_tokens=2048,
do_sample=True,
temperature=0.6,
top_p=0.9,
)
pprint(outputs[0]["generated_text"][len(prompt):], compact=True)
('<think>\n'
'Okay, so I need to solve the equation 2x + 3 = 7. Let me think about how to '
'approach this. I remember from algebra that the goal is to isolate the '
'variable x on one side of the equation. Let me try to recall the steps.\n'
'\n'
"First, maybe I should get rid of that 3 that's being added to 2x. To do "
'that, I can subtract 3 from both sides of the equation. That way, I keep the '
'equation balanced. Let me try that:\n'
'\n'
'Original equation: 2x + 3 = 7\n'
'\n'
'Subtract 3 from both sides: 2x + 3 - 3 = 7 - 3\n'
'\n'
'Simplifying that: 2x = 4\n'
'\n'
'Okay, so now the equation is 2x = 4. Now I need to get x by itself. Since 2 '
'is multiplied by x, I should divide both sides by 2 to undo the '
'multiplication. Let me do that:\n'
'\n'
'Divide both sides by 2: (2x)/2 = 4/2\n'
'\n'
'Simplifying that: x = 2\n'
'\n'
'So, x equals 2. Let me check my answer to make sure. If I plug x = 2 back '
'into the original equation:\n'
'\n'
'Left side: 2*(2) + 3 = 4 + 3 = 7\n'
'\n'
'Right side: 7\n'
'\n'
'Yes, both sides are equal. So x = 2 is correct.\n'
'\n'
"Wait, but let me make sure I didn't skip any steps. The equation was 2x + 3 "
'= 7. First, subtract 3 from both sides to get 2x = 4. Then divide by 2 to '
'get x = 2. That seems straightforward. Is there another way to approach '
'this?\n'
'\n'
'Alternatively, maybe I could first divide both sides by 2 before subtracting '
'3? Let me try that. Starting with 2x + 3 = 7.\n'
'\n'
'If I divide every term by 2: (2x)/2 + 3/2 = 7/2\n'
'\n'
'Which simplifies to x + 1.5 = 3.5\n'
'\n'
'Then subtract 1.5 from both sides: x = 3.5 - 1.5 = 2\n'
'\n'
'Same answer. So either way works. But usually, the standard method is to '
'isolate the term with the variable first, so subtract the constant term '
'first, then divide by the coefficient. That seems more straightforward.\n'
'\n'
'Another check: if I had done something wrong, like subtracting 2x first? '
"Wait, that's not possible. The equation is 2x + 3 = 7. If I tried to "
"subtract 2x, that would be 3 = 7 - 2x, which complicates things. So that's "
'not helpful.\n'
'\n'
'Alternatively, maybe using inverse operations. The equation is 2x + 3 = 7. '
'The operations are: multiply by 2, then add 3. To reverse that, we need to '
'do the inverse operations in the reverse order. So first, subtract 3, then '
'divide by 2. Which is exactly what I did before.\n'
'\n'
'So the steps are: subtract 3 from both sides, then divide both sides by 2. '
'That gives x = 2.\n'
'\n'
"I think that's all. Let me make sure there's no other step. For example, "
"sometimes people might try to factor or something, but in this case, it's a "
'linear equation with one variable, so the standard steps apply. So the '
'answer is x = 2. The steps are: subtract 3 from both sides, then divide by '
'2.\n'
'</think>\n'
'\n'
'To solve the equation $2x + 3 = 7$, follow these steps:\n'
'\n'
'1. **Subtract 3 from both sides** to isolate the term with the variable: \n'
' $$\n'
' 2x + 3 - 3 = 7 - 3 \\implies 2x = 4\n'
' $$\n'
'\n'
'2. **Divide both sides by 2** to solve for $x$: \n'
' $$\n'
' \\frac{2x}{2} = \\frac{4}{2} \\implies x = 2\n'
' $$\n'
'\n'
'**Verification**: \n'
'Substitute $x = 2$ back into the original equation: \n'
'$$\n'
'2(2) + 3 = 4 + 3 = 7\n'
'$$ \n'
'The left side equals the right side, confirming the solution is correct.\n'
'\n'
'**Final Answer**: \n'
'$$\n'
'\\boxed{2}\n'
'$$')
Now let's try our chocolate chip cookie recipe request.
messages = [
{"role": "user", "content": "You are a world renowned baker with many awards and Michelin stars. Give us your world famous recipe for chocolate chip cookies."},
]
prompt = pipeline.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
enable_thinking=True # Switches between thinking and non-thinking modes. Default is True. Very good performance when thinking = True
)
# Let's set some sampling values to have more control over the output
outputs = pipeline(
prompt,
max_new_tokens=2048,
do_sample=True,
temperature=0.6,
top_p=0.9,
)
pprint(outputs[0]["generated_text"][len(prompt):], compact=True)
('<think>\n'
'Okay, the user wants a world-famous chocolate chip cookie recipe from a '
'renowned baker with Michelin stars. Let me think about how to approach '
'this.\n'
'\n'
'First, I need to make sure the recipe is not just a regular one but '
'something that would earn Michelin stars. That means it should have '
'exceptional ingredients, precise measurements, and maybe some unique '
'techniques. \n'
'\n'
'I should start by recalling the key elements of a great chocolate chip '
'cookie. The texture is crucial—crispy edges and a gooey center. The '
'chocolate should be high quality, maybe using a specific type like semisweet '
'or a blend. \n'
'\n'
'The ingredients list needs to be detailed. Maybe include things like '
'buttermilk for tenderness, or a touch of vanilla for flavor. Also, the dough '
'should be perfectly balanced between flaky and chewy. \n'
'\n'
'Wait, the user mentioned "world famous," so maybe there\'s a signature '
'element. Perhaps a unique ingredient like a special type of chocolate, or a '
'technique like using a combination of sugars for texture. Also, the baking '
'temperature and time are important. Maybe a specific temperature and a '
'longer baking time to achieve that perfect crust.\n'
'\n'
'I should also think about the presentation. Maybe the cookies are made in a '
'way that they look elegant, like a decorative shape or a specific baking '
'method. Also, the recipe should be precise, with exact measurements. \n'
'\n'
"Let me check some top bakers' recipes. For example, a baker might use a "
'higher percentage of butter for a flakier texture. Also, using a combination '
'of brown and white sugar for a caramelized effect. Maybe the dough is rolled '
'out and cut into shapes, then baked to a specific temperature. \n'
'\n'
'Also, the chocolate chips should be high quality, maybe using a specific '
'brand or type. Maybe a blend of dark and milk chocolate for depth of '
'flavor. \n'
'\n'
"The user might also want the recipe to be something that's not just a "
'standard recipe but has a story or a unique twist. Maybe a signature element '
'like a "chocolate crumb" or a specific baking technique. \n'
'\n'
'I need to structure the recipe with ingredients and instructions. The '
'ingredients should be detailed, with specific measurements. The instructions '
'should be clear, step-by-step. Also, maybe include some tips for the perfect '
'result, like chilling the dough or using a specific type of chocolate. \n'
'\n'
"Wait, the user is asking for a world-famous recipe, so maybe it's inspired "
'by a Michelin-starred restaurant. The name could be something like "The '
'Crisp & Crumble" or something similar. \n'
'\n'
'Also, the recipe should have a unique selling point. Maybe the cookies are '
'made with a specific type of flour, or a unique method of mixing the '
'dough. \n'
'\n'
'I should also consider the balance between the ingredients. For example, the '
'ratio of butter to flour, the type of sugar, the role of baking powder or '
'soda. \n'
'\n'
'Let me outline the ingredients first. The base ingredients would be flour, '
'butter, sugar, egg, baking powder, buttermilk, vanilla, and chocolate chips. '
'Maybe some additional ingredients like a touch of salt or a specific type of '
'chocolate. \n'
'\n'
'For the instructions, the process would involve creaming the butter and '
'sugar, adding egg, then the dry ingredients, mixing in buttermilk, then '
'folding in the chocolate chips. Then chilling, shaping, and baking. \n'
'\n'
'I need to make sure the steps are clear and the measurements are precise. '
'Also, the baking temperature and time. Maybe 350°F (175°C) for 12-14 '
'minutes. \n'
'\n'
'I should also add some tips, like using a higher quality chocolate, chilling '
'the dough, or using a specific type of baking sheet. \n'
'\n'
'Wait, the user might also want the recipe to have a unique twist. Maybe a '
'hint of cinnamon or a different type of spice. Or maybe a unique shape, like '
'a decorative cookie. \n'
'\n'
'Also, the texture is important. The dough should be soft but not too sticky. '
'Maybe the dough is rolled out and cut into shapes, then baked. \n'
'\n'
"I think that's a good start. Now, I need to structure all this into a recipe "
"that's both detailed and elegant, with the right balance of ingredients and "
'steps. Make sure to mention the quality of ingredients, the precise '
'measurements, and the techniques that make the cookies exceptional.\n'
'</think>\n'
'\n'
'**World-Famous Chocolate Chip Cookies: "The Crisp & Crumble"** \n'
'*Inspired by the artistry of Michelin-starred pastry chefs, this recipe '
'balances indulgence with precision, offering a perfect harmony of crisp '
'edges and gooey centers.* \n'
'\n'
'---\n'
'\n'
'### **Ingredients** \n'
'*(Serves 24 cookies; adjust as needed)* \n'
'\n'
'**For the Dough:** \n'
'- 1 ½ cups (190g) all-purpose flour (preferably from a premium brand) \n'
'- 1 cup (220g) unsalted butter, cold and cubed (for flaky texture) \n'
'- 1 cup (200g) granulated sugar \n'
'- ½ cup (100g) brown sugar (for caramel depth) \n'
'- 1 large egg, at room temperature \n'
'- 1 tsp baking powder \n'
'- 1 tsp salt \n'
'- 1 cup (240ml) buttermilk (whipped for tenderness) \n'
'- 1 tsp vanilla extract \n'
'- 1 tsp ground cinnamon (for warmth) \n'
'\n'
'**For the Chocolate Crumb:** \n'
'- 1 ½ cups (180g) semisweet chocolate chips (dark, 60% cocoa, for depth) \n'
'- 1 cup (120g) chopped nuts (walnuts or pecans, for texture) \n'
'\n'
'---\n'
'\n'
'### **Instructions** \n'
'\n'
'**1. Prepare the Dough:** \n'
'- In a large bowl, mix the flour, baking powder, salt, and cinnamon. \n'
'- In a separate bowl, cream the cold butter, granulated sugar, and brown '
'sugar until light and fluffy. \n'
'- Add the egg, buttermilk, and vanilla. Mix until just combined. \n'
'- Gradually add the dry ingredients, stirring until a soft dough forms. \n'
'\n'
'**2. Fold in Chocolate Crumb:** \n'
'- In a small bowl, combine the chocolate chips and nuts. \n'
'- Fold this mixture into the dough, ensuring even distribution. \n'
'\n'
'**3. Chill and Shape:** \n'
'- Wrap the dough in plastic wrap and chill for 1 hour (or refrigerate '
'overnight). \n'
'- Roll the dough into 1-inch balls (about 1.5 tbsp per ball). \n'
'- Press a small spoonful of the chocolate crumb mixture into each ball, '
'creating a "crumb" center. \n'
'\n'
'**4. Bake to Perfection:** \n'
'- Preheat oven to 350°F (175°C). Line a baking sheet with parchment '
'paper. \n'
'- Place the dough balls 2 inches apart on the sheet. \n'
'- Bake for 12–14 minutes, until the edges are golden and the centers are '
'just set. \n'
'\n'
'**5. Cool and Serve:** \n'
'- Let the cookies cool completely on the sheet. \n'
'- Store in an airtight container for up to 5 days, or freeze for up to 2 '
'months. \n'
'\n'
'---\n'
'\n'
'### **Secrets to Michelin-Style Perfection** \n'
'- **Butter Precision:** Use cold, cubed butter for flaky layers. \n'
'- **Buttermilk Magic:** Whipped buttermilk adds tenderness and a subtle '
'tang. \n'
'- **Chocolate Crumb:** A blend of dark chocolate and nuts creates a rich, '
'textural contrast. \n'
'- **Chilling:** Chill the dough to firm the butter, ensuring a crisp '
'edge. \n'
'\n'
'---\n'
'\n'
'### **Why This Recipe Stands Out** \n'
'- **Texture:** The balance of butter, sugar, and crumb creates a "crisp & '
'crumble" contrast. \n'
'- **Flavor:** Cinnamon and buttermilk elevate the chocolate, while nuts add '
'depth. \n'
'- **Presentation:** The cookies are baked to a golden crust, with a gooey '
'center that melts on the tongue. \n'
'\n'
'*Serve warm with a scoop of velvety vanilla ice cream, or as a decadent '
'dessert for any occasion.* \n'
'\n'
'--- \n'
'*Inspired by the culinary artistry of Michelin-starred pastry chefs, this '
'recipe is a tribute to the timeless joy of chocolate, crafted with precision '
'and passion.* 🍪🍫')
Now let's try the 14B model!
- With the thinking parameter turned on, it thinks a lot!
- The thoughts are noticeably higher quality.
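One way to quantify "thinking a lot" is to measure what fraction of each model's output is spent inside the <think> block versus in the final answer (a sketch of our own, assuming the tags appear exactly once):

```python
def thinking_ratio(text: str) -> float:
    """Fraction of generated words spent inside the <think>...</think> block."""
    if "<think>" not in text or "</think>" not in text:
        return 0.0
    start = text.index("<think>") + len("<think>")
    end = text.index("</think>")
    think_words = len(text[start:end].split())
    answer_words = len(text[end + len("</think>"):].split())
    total = think_words + answer_words
    return think_words / total if total else 0.0

# Compare the 4B and 14B outputs on the same prompt: a larger ratio means more
# of the token budget went to reasoning before answering.
print(thinking_ratio("<think> step one step two </think> answer"))  # 0.8
```

Running this on the `content` strings from both models makes the "thinks more" observation a number you can track across prompts.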
# model_id = "Qwen/Qwen3-4B" #Try first, downloads and loads faster
model_id = "Qwen/Qwen3-14B" #You can try this much larger version and compare its performance. More parameters generally means better quality
from transformers import pipeline # re-import the factory: the name `pipeline` was rebound to the 4B Pipeline object above
pipeline = pipeline(
"text-generation",
model=model_id,
model_kwargs={"dtype": torch.bfloat16, "quantization_config": quantization_config},
device_map="auto",
)
messages = [
{"role": "system", "content": "You are a science communicator who makes technology accessible to everyone!"},
{"role": "user", "content": "Please write a five sentence explanation of how LLMs do knowledge representation."},
]
outputs = pipeline(
messages,
max_new_tokens=512,
)
pprint(outputs[0]["generated_text"][-1], compact=True)
Device set to use cuda:0
{'content': '<think>\n'
'Okay, the user wants a five-sentence explanation of how LLMs do '
'knowledge representation. Let me start by recalling what I know '
'about LLMs. They use neural networks, right? So the first point '
'should be about the architecture, like transformer models with '
'attention mechanisms.\n'
'\n'
"Next, knowledge representation in LLMs isn't explicit like "
'databases. Instead, they encode information in the weights of the '
"model. That's important because it's different from traditional "
'methods.\n'
'\n'
'Then, I should mention how training on vast text data allows them '
'to capture patterns and relationships between concepts. This '
'helps in understanding context and generating coherent '
'responses.\n'
'\n'
'Also, the distributed nature of the embeddings is key. Words and '
'concepts are represented as vectors in a high-dimensional space, '
'capturing semantic meaning through their proximity to other '
'vectors.\n'
'\n'
'Finally, the fifth sentence should tie it all together, '
"explaining that while they don't store facts explicitly, the "
"model's parameters implicitly encode a vast amount of structured "
'knowledge, which it can retrieve and use during inference.\n'
'\n'
'Wait, does that cover all aspects? Let me check each sentence. '
'First, architecture. Second, implicit encoding. Third, training '
'data and patterns. Fourth, embeddings and vectors. Fifth, '
'structured knowledge in parameters. Yeah, that seems to cover the '
"main points without being too technical. Make sure it's clear and "
'concise.\n'
'</think>\n'
'\n'
'Large Language Models (LLMs) represent knowledge implicitly '
'through vast neural networks, where patterns from training data '
"are encoded in the model's parameters. Instead of storing facts "
'explicitly, they learn relationships between concepts by '
'analyzing massive text corpora, allowing them to infer '
'connections during inference. This knowledge is represented as '
'distributed embeddings—high-dimensional vectors that capture '
'semantic meaning, context, and relational information. Attention '
'mechanisms enable the model to dynamically prioritize relevant '
'parts of the input or stored knowledge, mimicking human-like '
'reasoning. While LLMs don’t "store" information like databases, '
'their trained weights effectively summarize and organize '
'knowledge in a way that can be retrieved and applied '
'contextually.',
'role': 'assistant'}
messages = [
{"role": "user", "content": "What are the steps required for solving an 2x + 3 = 7 equation?"},
]
prompt = pipeline.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
# Let's set some sampling values to have more control over the output
outputs = pipeline(
prompt,
max_new_tokens=2048,
do_sample=True,
temperature=0.6,
top_p=0.9,
)
pprint(outputs[0]["generated_text"][len(prompt):], compact=True)
('<think>\n'
'Okay, so I need to solve the equation 2x + 3 = 7. Hmm, let me think. I '
'remember from algebra that the goal is to isolate the variable, which in '
"this case is x. So, I need to get rid of the 3 that's being added to 2x, and "
"then deal with the coefficient 2 that's multiplied by x. \n"
'\n'
'First step, maybe I should subtract 3 from both sides of the equation to '
'undo the addition. Let me try that. If I subtract 3 from both sides, it '
'would look like:\n'
'\n'
'2x + 3 - 3 = 7 - 3\n'
'\n'
'Simplifying both sides, the +3 and -3 on the left side cancel each other '
'out, leaving just 2x. On the right side, 7 minus 3 is 4. So now the equation '
'is 2x = 4. That seems right.\n'
'\n'
'Now, the next step is to get x by itself. Since 2 is multiplied by x, I need '
'to do the opposite operation, which is division. So, I should divide both '
'sides of the equation by 2. Let me write that down:\n'
'\n'
'2x / 2 = 4 / 2\n'
'\n'
'On the left side, the 2 in the numerator and the 2 in the denominator cancel '
'each other out, leaving just x. On the right side, 4 divided by 2 is 2. So, '
'x equals 2. \n'
'\n'
"Wait, let me check if that's correct. If I substitute x = 2 back into the "
"original equation, does it hold true? Let's see:\n"
'\n'
'Original equation: 2x + 3 = 7\n'
'\n'
'Substitute x with 2: 2*(2) + 3 = 4 + 3 = 7. \n'
'\n'
'Yes, that equals 7, which matches the right side of the equation. So, x = 2 '
'is indeed the correct solution. \n'
'\n'
"But let me go through the steps again to make sure I didn't skip anything. "
'Starting with 2x + 3 = 7. Subtract 3 from both sides: 2x = 4. Then divide by '
'2: x = 2. Yep, that seems straightforward. \n'
'\n'
"I wonder if there's another way to approach this problem? Maybe by dividing "
'first? Let me try that. If I have 2x + 3 = 7, and I try to divide everything '
'by 2 first, that would give me x + 3/2 = 7/2. Then subtract 3/2 from both '
'sides: x = 7/2 - 3/2 = 4/2 = 2. Oh, same result. So either method works, but '
"subtracting 3 first seems more intuitive for me because it's simpler to deal "
'with integers rather than fractions. \n'
'\n'
'Another thing to consider: what if I made a mistake in the subtraction step? '
"Let me verify. If I subtract 3 from 7, it's 4. That's straightforward. If I "
'had messed up that step, say, subtracted 2 instead of 3, I would have gotten '
'2x = 5, leading to x = 2.5, which would be wrong. But since I checked my '
'answer by plugging it back in, I can be confident that the solution is '
'correct. \n'
'\n'
'Also, thinking about the properties of equality: whatever operation I '
'perform on one side of the equation, I must perform on the other side to '
'maintain equality. So subtracting 3 from both sides and dividing both sides '
'by 2 are both valid operations. \n'
'\n'
'Is there a possibility that there could be no solution or infinite solutions '
'here? Well, since this is a linear equation with degree 1, there should be '
'exactly one solution, which is x = 2. If the equation had been something '
'like 2x + 3 = 2x + 5, then subtracting 2x from both sides would lead to 3 = '
'5, which is impossible, meaning no solution. But in this case, the '
"coefficients are set up so that there's a unique solution. \n"
'\n'
'Just to recap the steps clearly:\n'
'\n'
'1. Start with the original equation: 2x + 3 = 7.\n'
'2. Subtract 3 from both sides to isolate the term with the variable: 2x = 7 '
'- 3.\n'
'3. Simplify the right side: 2x = 4.\n'
'4. Divide both sides by 2 to solve for x: x = 4 / 2.\n'
'5. Simplify to get the solution: x = 2.\n'
'6. Verify the solution by substituting back into the original equation.\n'
'\n'
'Each step follows logically from the previous one, and the verification step '
"ensures that the solution is correct. I don't see any errors in this "
"process. It's important to perform each operation carefully and check the "
'work to avoid simple arithmetic mistakes, which are common when solving '
'equations. \n'
'\n'
'Another thing I might consider is if the equation was written correctly. For '
'example, if there was a typo or misread the original problem, but assuming '
'the equation is 2x + 3 = 7, then the steps are as outlined. If the equation '
'were different, say 2x + 3 = 7x, the approach would be different, but in '
"this case, it's straightforward. \n"
'\n'
'In summary, solving a linear equation like this involves inverse operations '
'to isolate the variable. By reversing the operations applied to the variable '
'(first subtraction, then division), we can find the value of x that '
'satisfies the equation. Checking the solution by substitution is a good '
'practice to confirm the result.\n'
'</think>\n'
'\n'
'To solve the equation $2x + 3 = 7$, follow these steps:\n'
'\n'
'1. **Subtract 3 from both sides** to isolate the term with the variable:\n'
' $$\n'
' 2x + 3 - 3 = 7 - 3\n'
' $$\n'
' Simplifying both sides:\n'
' $$\n'
' 2x = 4\n'
' $$\n'
'\n'
'2. **Divide both sides by 2** to solve for $x$:\n'
' $$\n'
' \\frac{2x}{2} = \\frac{4}{2}\n'
' $$\n'
' Simplifying:\n'
' $$\n'
' x = 2\n'
' $$\n'
'\n'
'3. **Verify the solution** by substituting $x = 2$ back into the original '
'equation:\n'
' $$\n'
' 2(2) + 3 = 4 + 3 = 7\n'
' $$\n'
' Since both sides are equal, the solution is confirmed.\n'
'\n'
'**Final Answer:** \n'
'$$\n'
'\\boxed{2}\n'
'$$')
messages = [
{"role": "user", "content": "You are a world renowned baker with many awards and Michelin stars. Give us your world famous recipe for chocolate chip cookies."},
]
prompt = pipeline.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
enable_thinking=True # Switches between thinking and non-thinking modes. Default is True. Very good performance when thinking = True
)
# Let's set some sampling values for more control over the output; subjectively, Qwen 3 14B generates better results than the 4B model, and better than GPT-3.5
outputs = pipeline( # e.g., a text-generation or chat pipeline already constructed
prompt, # the input string (your instruction / few-shot prefix)
max_new_tokens=2048, # cap on *newly generated* tokens (doesn't count prompt tokens)
do_sample=True, # enable sampling (vs. greedy/beam); needed for temperature/top_p
temperature=0.6, # lower = more focused/deterministic, higher = more creative
top_p=0.9, # nucleus sampling: sample from smallest token set with cum prob ≥ 0.9
)
pprint(outputs[0]["generated_text"][len(prompt):], compact=True)
('<think>\n'
'Okay, the user wants my world-famous chocolate chip cookie recipe. Let me '
'start by recalling the key elements that make a great cookie. First, the '
'dough needs to be rich and buttery. I should use high-quality unsalted '
'butter, maybe even European-style for better flavor. Brown sugar adds '
'moisture and a deeper molasses flavor compared to white sugar. The ratio of '
'sugars is important—maybe 2:1 brown to white?\n'
'\n'
'Next, the flour. I prefer all-purpose flour, but maybe a touch of bread '
'flour for a bit more structure without making them tough. Baking soda is '
'essential for leavening, and a pinch of salt to balance the sweetness. \n'
'\n'
'For the chocolate, using high-quality semi-sweet chocolate chips is a must. '
'Maybe add some chopped chocolate for texture variation. The key is to melt '
"the chocolate just enough so it's glossy but not oily. \n"
'\n'
'The dough needs to rest in the fridge. This helps the gluten relax, leading '
'to a better texture. When baking, the temperature is crucial. I usually '
'preheat the oven to 350°F (175°C) and bake for 10-12 minutes. The cookies '
'should be golden around the edges but still soft in the center. \n'
'\n'
'Oh, and the "pillow" texture—achieved by not overbaking. Maybe mention the '
'"crack" on the surface from the sugar. Also, the recipe should include a '
'note about the cookies spreading, so maybe suggest using parchment paper and '
'spacing them out. \n'
'\n'
'Wait, should I include any special techniques? Like the "resting" of the '
'dough, or maybe a specific type of chocolate? Also, maybe a tip about using '
'a silicone spatula when mixing to avoid overmixing. \n'
'\n'
'Let me check the measurements. 2 3/4 cups flour, 1 tsp baking soda, 1 tsp '
'salt. Wait, that seems like a lot of salt. Maybe 1/2 tsp? No, the original '
'Toll House recipe uses 1 tsp. But maybe adjust it slightly for a more '
'balanced flavor. \n'
'\n'
'Also, the eggs—should they be at room temperature? Maybe mention that for '
'better emulsification. And the vanilla extract, using pure vanilla for '
'better aroma. \n'
'\n'
'I should also think about the presentation. Maybe suggest a dusting of sea '
'salt on top for contrast. And the serving suggestion—warm with a glass of '
'milk. \n'
'\n'
'Wait, the user mentioned Michelin stars, so the recipe needs to be precise. '
'Maybe include exact measurements and temperatures. Also, any variations? '
"Like gluten-free or dairy-free options? But maybe that's beyond the scope "
'unless the user asks. \n'
'\n'
'Double-check the steps: cream butter and sugars, add eggs and vanilla, mix '
'in dry ingredients, fold in chocolate, chill, scoop, bake. Yes. \n'
'\n'
'I should also mention the "magic" of the cookies spreading and the perfect '
'balance between crispy edges and soft center. Maybe add a note about the '
'origin of the recipe, inspired by Ruth Wakefield but with some personal '
'twists. \n'
'\n'
'Alright, time to put it all together in a clear, step-by-step format with '
"measurements and tips. Make sure it's easy to follow and sounds professional "
'yet approachable.\n'
'</think>\n'
'\n'
'**World-Famous Chocolate Chip Cookies** \n'
'*Inspired by tradition, perfected with precision—crafted for a '
'Michelin-starred touch.* \n'
'\n'
'---\n'
'\n'
'### **Ingredients** \n'
'*(Makes 24–30 cookies, depending on size)* \n'
'\n'
'**For the Dough:** \n'
'- **Unsalted butter** (European-style, 225g / 1 cup, softened) \n'
'- **Brown sugar** (packed, 200g / 1 1/4 cups) \n'
'- **Granulated sugar** (100g / 1/2 cup) \n'
'- **Eggs** (2 large, at room temperature) \n'
'- **Vanilla extract** (2 tsp, pure) \n'
'- **All-purpose flour** (270g / 2 3/4 cups) \n'
'- **Baking soda** (1 tsp) \n'
'- **Salt** (1/2 tsp fine sea salt, plus a pinch for baking) \n'
'- **Semi-sweet chocolate chips** (200g / 1 3/4 cups, high-quality) \n'
'- **Chopped dark chocolate** (optional, 50g / 1/3 cup, for texture '
'contrast) \n'
'\n'
'---\n'
'\n'
'### **Instructions** \n'
'\n'
'1. **Prep the Oven** \n'
' Preheat oven to **350°F (175°C)**. Line baking sheets with parchment '
'paper. \n'
'\n'
'2. **Cream the Butter and Sugars** \n'
' In a large bowl, use a stand mixer or handheld whisk to **cream the '
'butter, brown sugar, and granulated sugar** until light and fluffy (≈3–4 '
'minutes). \n'
'\n'
'3. **Add Eggs and Vanilla** \n'
' Beat in eggs one at a time, ensuring each is fully incorporated. Scrape '
'down the bowl, then add vanilla extract. \n'
'\n'
'4. **Incorporate Dry Ingredients** \n'
' Whisk flour, baking soda, and salt in a separate bowl. Fold dry '
'ingredients into the wet mixture **on low speed** until just combined (avoid '
'overmixing). \n'
'\n'
'5. **Fold in Chocolate** \n'
' Gently stir in chocolate chips and chopped chocolate with a silicone '
'spatula. Cover and **chill dough for 1 hour** (or freeze for 30 minutes) to '
'firm up, preventing spreading. \n'
'\n'
'6. **Scoop and Bake** \n'
' Drop rounded tablespoons of dough onto prepared sheets (spaced 2–3 inches '
'apart). Bake for **10–12 minutes**, until edges are golden but centers '
'remain soft. Let rest on sheets for 2 minutes before transferring to a wire '
'rack. \n'
'\n'
'7. **Serve Warm** \n'
' Enjoy with a glass of cold milk or a scoop of vanilla ice cream. \n'
'\n'
'---\n'
'\n'
'### **Pro Tips for Perfection** \n'
'- **Butter Temperature:** Softened butter (room temperature) ensures a silky '
'texture. \n'
'- **Chocolate Quality:** Use couverture chocolate for a richer, less waxy '
'finish. \n'
'- **Resting the Dough:** Chilling enhances flavor and prevents spread. \n'
'- **Bake Time:** Resist the urge to overbake—cookies should be "soft and '
'chewy" with a crisp edge. \n'
'- **Sea Salt Finish:** Sprinkle flaky sea salt on top before baking for a '
'gourmet touch. \n'
'\n'
'---\n'
'\n'
'### **The Secret to "Pillow-Soft" Cookies** \n'
'The magic lies in the balance of sugars (brown for moisture, white for '
'structure), the gentle folding of ingredients, and precise baking. The '
'result? A cookie that’s crispy on the outside, chewy in the center, and '
'*unforgettable*. \n'
'\n'
'*Bon appétit, and may your oven be forever filled with the scent of butter '
'and chocolate.* 🍪✨ \n'
'\n'
'--- \n'
'*Chef’s Note: Adjust baking time slightly if using a convection oven. For '
'gluten-free, substitute flour blend and ensure chocolate is gluten-free '
'certified.*')
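The decoding knobs passed to the pipeline call above (`temperature` and `top_p`) can be sketched in plain Python. Assuming the model's next-token distribution is already given as a `{token: probability}` dict, a hypothetical `top_p_sample` helper would apply temperature scaling and then nucleus filtering (real decoders operate on logits over the full vocabulary; this is only a sketch of the same arithmetic):

```python
import random

def top_p_sample(probs, top_p=0.9, temperature=0.6, rng=None):
    """Hypothetical helper: sample one token from a {token: probability}
    dict using temperature scaling followed by nucleus (top-p) filtering."""
    rng = rng or random.Random(0)
    # Temperature: dividing logits by T is equivalent to raising the
    # softmax probabilities to the power 1/T and renormalizing.
    scaled = {t: p ** (1.0 / temperature) for t, p in probs.items()}
    total = sum(scaled.values())
    scaled = {t: p / total for t, p in scaled.items()}
    # Nucleus filter: keep the smallest set of highest-probability
    # tokens whose cumulative mass reaches top_p.
    kept, cumulative = {}, 0.0
    for token, p in sorted(scaled.items(), key=lambda kv: -kv[1]):
        kept[token] = p
        cumulative += p
        if cumulative >= top_p:
            break
    # Draw from the filtered set; dividing by the kept mass
    # renormalizes implicitly.
    draw, acc = rng.random() * sum(kept.values()), 0.0
    for token, p in kept.items():
        acc += p
        if acc >= draw:
            return token
    return token  # guard against floating-point underflow
```

With `top_p=0.5` on a distribution like `{"a": 0.7, "b": 0.2, "c": 0.1}`, only `"a"` survives the filter (temperature 0.6 sharpens its share above 0.5), so sampling becomes deterministic; raising `top_p` toward 1.0 readmits the tail and increases variety.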
4.4 ChatGPT Output¶
Finally, for comparison, let's see what ChatGPT (GPT-3.5) produces for the same prompt. How well does ChatGPT follow directions, given its reinforcement learning from human feedback?
Here is its output:
As a world-renowned James Beard award-winning pastry chef, I'm delighted to share with you my special recipe for chocolate chip cookies. These cookies are the epitome of decadence, with a perfect balance of flavors and textures. Without further ado, here are the ingredients and instructions:
Ingredients:
1 cup (226g) unsalted butter, softened
1 cup (200g) granulated sugar
1 cup (220g) packed brown sugar
2 large eggs
1 teaspoon vanilla extract
3 cups (384g) all-purpose flour
1 teaspoon baking soda
1/2 teaspoon salt
2 cups (340g) semisweet chocolate chips
Instructions:
Preheat your oven to 350°F (175°C). Line baking sheets with parchment paper or silicone baking mats.
In a large mixing bowl, cream together the softened butter, granulated sugar, and brown sugar until light and fluffy.
Add the eggs one at a time, beating well after each addition. Stir in the vanilla extract.
In a separate bowl, whisk together the flour, baking soda, and salt.
Gradually add the dry ingredient mixture to the wet ingredients, mixing until just combined. Be careful not to overmix, as this can result in tough cookies.
Fold in the semisweet chocolate chips until evenly distributed throughout the dough.
Scoop rounded tablespoons of dough and drop them onto the prepared baking sheets, spacing them about 2 inches apart. You can also use a cookie scoop for more uniform cookies.
Bake the cookies in the preheated oven for 10-12 minutes or until they are golden brown around the edges but still slightly soft in the center.
Remove the baking sheets from the oven and allow the cookies to cool on the sheets for 5 minutes. Then, transfer them to wire racks to cool completely.
Once the cookies have cooled, store them in an airtight container to maintain their freshness.
Now, you have the recipe for my award-winning chocolate chip cookies. Enjoy baking these delightful treats and savor every bite!