cjwbw/starcoder2-15b 🔢📝 → 📝

▶️ 277 runs 📅 Mar 2024 ⚙️ Cog 0.9.4 🔗 GitHub 📄 Paper ⚖️ License
code-generation text-generation

About

Language Models for Code

Example Output

Prompt:

"class encoder_decoder_transformer():"

Output

class encoder_decoder_transformer():
    def __init__(self, vocab_size, d_model, num_layers, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        self.encoder = encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output, attention_weights

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask

def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def evaluate(inp_sentence):
    start_token = [tokenizer_pt.vocab_size]
    end_token = [tokenizer_pt.vocab_size + 1]
    inp_sentence = start_token + tokenizer_pt.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)
    decoder_input = [tokenizer_pt.vocab_size]
    output = tf.expand_dims(decoder_input, 0)
    for i in range(MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)
        predictions, attention_weights = transformer(encoder_input, output, False, enc_padding_mask, combined_mask, dec_padding_mask)
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        if predicted_id == tokenizer_pt.vocab_size+1:
            return tf.squeeze(output, axis=0), attention_weights
        output = tf.concat([output, predicted_id], axis=-1)
    return tf.squeeze(output, axis=0), attention_weights

def plot_attention_weights(attention, sentence, result, layer):
    fig = plt.figure(figsize=(16, 8))
    sentence = tokenizer_pt.encode(sentence)
    attention = tf.squeeze(attention[layer], axis=0)
    for head in range(attention.shape[0]):
        ax = fig.add_subplot(2, 4, head+1)
        ax.matshow(attention[head][:-1, :], cmap='viridis')
        fontdict = {'fontsize': 10}
        ax.set_xticks(range(len(sentence)+2))
        ax.set_yticks(range(len(result)))
        ax.set_ylim(len(result)-1.5, -0.5)
        ax.set_xticklabels(
            ['']+[tokenizer_pt.decode([i]) for i in sentence]+[''],
            fontdict=fontdict, rotation=90)
        ax.set_yticklabels([tokenizer_en.decode([i]) for i in result
                            if i < tokenizer_en.vocab_size],
                           fontdict=fontdict)
        ax.set_xlabel('Head {}'.format(head+1))
    plt.tight_layout()
    plt.show()

def translate(sentence, plot=''):
    result, attention_weights = evaluate(sentence)
    predicted_sentence = tokenizer_en.decode([i for i in result if i < tokenizer_en.vocab_size])
    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
    if plot:
        plot_attention_weights(attention_weights, sentence, result, plot)

def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)

def train(EPOCHS):
    for epoch in range(EPOCHS):
        start = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()
        for (batch, (inp, tar)) in enumerate(train_dataset):
            train_step(inp, tar)
            if batch % 50 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, batch, train_loss.result(), train_accuracy.result()))
        if (epoch + 1) % 5 == 0:
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))
        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, train_loss.result(), train_accuracy.result()))
        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

def main():
    train(EPOCHS)
    translate("este é um problema que temos que resolver.")
    translate("os meus vizinhos ouviram sobre esta ideia.")
    translate("vou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram.")
    translate("este é o primeiro livro que eu fiz.", plot='decoder_layer4_block2')

if __name__ == "__main__":
    main()
/README.md

Transformer

This is a TensorFlow implementation of the Transformer model from the paper "Attention Is All You Need" by Vaswani et al.

The model is trained on the Portuguese-English translation dataset from Tatoeba.

Requirements

  • Python 3.6
  • Tensorflow 2.0
  • Tensorflow Datasets
  • Tensorflow Text
  • Matplotlib

Usage

To train the model, run:

python transformer.py

Performance Metrics

115.26s Prediction Time
115.26s Total Time
All Input Parameters
{
  "top_k": -1,
  "top_p": 0.95,
  "prompt": "class encoder_decoder_transformer():",
  "temperature": 0.01,
  "max_new_tokens": 2000
}
Input Parameters

top_k (integer, default: -1, range: -1 to ∞)
When decoding text, samples from the top k most likely tokens; lower to ignore less likely tokens.

top_p (number, default: 0.95, range: 0 to 1)
When decoding text, samples from the top p fraction of most likely tokens; lower to ignore less likely tokens.

prompt (string, default: def print_hello_world():)
Text input.

temperature (number, default: 0.7, range: 0.01 to 5)
Adjusts the randomness of outputs: values greater than 1 are more random, values near 0 are nearly deterministic; 0.75 is a good starting value.

max_new_tokens (integer, default: 256, range: 1 to ∞)
Maximum number of tokens to generate. A word is generally 2-3 tokens.
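
For reference, here is a minimal sketch of invoking this model from the Replicate Python client with the parameters shown above. It assumes the replicate package is installed and a REPLICATE_API_TOKEN environment variable is set; the version string is the one listed under Version Details below.

    import replicate

    # Run cjwbw/starcoder2-15b with the same inputs as the example above.
    output = replicate.run(
        "cjwbw/starcoder2-15b:d67b7d32b63bb8a2cf6b95c523921408e38ce7d7228fdff7b1eb636dc2c5ecd8",
        input={
            "prompt": "class encoder_decoder_transformer():",
            "temperature": 0.01,
            "top_p": 0.95,
            "top_k": -1,
            "max_new_tokens": 2000,
        },
    )
    print(output)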
Output Schema

Output

Type: array, Items Type: string
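
Because the output is an array of strings rather than a single string, clients typically concatenate the elements to recover the generated text. A minimal sketch, assuming output is the array returned by the call shown earlier:

    # Assumption: `output` is a list of string chunks in generation order,
    # per the array-of-strings schema above; join them into one completion.
    generated = "".join(output)
    print(generated)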

Example Execution Logs
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
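These are the standard Hugging Face transformers generation warnings and do not affect calls made through the Replicate API. If you run a StarCoder2 checkpoint directly with transformers, they can usually be avoided by passing the tokenizer's attention mask and an explicit pad token id to generate. A hedged sketch, assuming the bigcode/starcoder2-15b checkpoint and sufficient GPU memory:

    # Sketch only: running the checkpoint locally with Hugging Face transformers
    # (not required for the Replicate API). Model id and dtype are assumptions.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "bigcode/starcoder2-15b"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, device_map="auto"
    )

    inputs = tokenizer("class encoder_decoder_transformer():", return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,                             # includes input_ids and attention_mask
        pad_token_id=tokenizer.eos_token_id,  # explicit pad token silences the warning
        max_new_tokens=256,
    )
    print(tokenizer.decode(out[0], skip_special_tokens=True))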
Version Details
Version ID
d67b7d32b63bb8a2cf6b95c523921408e38ce7d7228fdff7b1eb636dc2c5ecd8
Version Created
March 20, 2024
Run on Replicate →