Introduction
Transformers revolutionized NLP by using the self-attention mechanism to process sequences.
Attention Mechanism
import tensorflow as tf
from tensorflow.keras import layers
class AttentionLayer(layers.Layer):
    """Scaled dot-product attention.

    Computes ``softmax(query @ key^T / sqrt(d_k)) @ value``, where ``d_k`` is
    the size of the last axis of ``key``.
    """

    def __init__(self, **kwargs):
        """Create the layer.

        Args:
            **kwargs: Standard Keras ``Layer`` options (e.g. ``name``,
                ``dtype``), forwarded unchanged to the base class.
        """
        super().__init__(**kwargs)

    def call(self, query, key, value):
        """Return the attention-weighted combination of ``value``.

        Args:
            query: Query tensor; its last axis must match the last axis of
                ``key`` for the matrix product to be defined.
            key: Key tensor; the size of its last axis is the scaling
                dimension ``d_k``.
            value: Value tensor that is mixed according to the attention
                weights.

        Returns:
            The product of the softmax-normalized scores and ``value``.
        """
        scores = tf.matmul(query, key, transpose_b=True)
        # Cast the scale to the scores' own dtype instead of a hard-coded
        # float32: dividing a float16/float64 tensor by a float32 tensor
        # raises a dtype-mismatch error (e.g. under a mixed-precision policy).
        key_dim = tf.cast(tf.shape(key)[-1], scores.dtype)
        scores = scores / tf.math.sqrt(key_dim)
        # Normalize over the key axis so each query's weights sum to 1.
        attention_weights = tf.nn.softmax(scores, axis=-1)
        return tf.matmul(attention_weights, value)
Transformer Block
class TransformerBlock(layers.Layer):
    """Self-attention followed by a feed-forward network.

    Each of the two sub-layers is wrapped in a residual connection and
    followed by layer normalization.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, **kwargs):
        """Build the sub-layers of the block.

        Args:
            embed_dim: Output width of the feed-forward network; also passed
                to ``MultiHeadAttention`` as ``key_dim`` (the per-head size).
            num_heads: Number of attention heads.
            ff_dim: Width of the hidden (ReLU) layer of the feed-forward
                network.
            **kwargs: Standard Keras ``Layer`` options (e.g. ``name``,
                ``dtype``), forwarded unchanged to the base class.
        """
        super().__init__(**kwargs)
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
        )
        # The file imports only `tf` and `layers`; a bare `keras` name is
        # undefined here and would raise NameError, so go through `tf.keras`.
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization()
        self.layernorm2 = layers.LayerNormalization()

    def call(self, x):
        """Apply self-attention and the feed-forward network to ``x``.

        Args:
            x: Input tensor. It is added to the feed-forward output, so its
                last axis is presumably ``embed_dim`` wide — confirm against
                the caller.

        Returns:
            The normalized output of the second residual connection.
        """
        # Self-attention: x serves as both query and value (and, by the
        # MultiHeadAttention default, as key).
        attn_output = self.attention(x, x)
        x = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(x)
        return self.layernorm2(x + ffn_output)
Practice Problems
- Implement self-attention
- Build transformer block
- Create positional encoding
- Stack transformer layers
- Use Hugging Face Transformers