Chapter 2 - Tokens and Token Embeddings

Exploring tokens and embeddings as an integral part of building LLMs



This notebook is for Chapter 2 of the Hands-On Large Language Models book by Jay Alammar and Maarten Grootendorst.



[OPTIONAL] - Installing Packages on Google Colab

If you are viewing this notebook on Google Colab (or any other cloud platform), run the following code block to install the dependencies for this chapter:


💡 NOTE: We will want to use a GPU to run the examples in this notebook. In Google Colab, go to Runtime > Change runtime type > Hardware accelerator > GPU > GPU type > T4.
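If you are not sure whether the runtime actually has a GPU attached, here is a quick check with PyTorch (installed as a dependency of transformers):

In [ ]:
import torch

# True means a CUDA-capable GPU is available to this runtime
print(torch.cuda.is_available())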


In [1]:
%%capture
!pip install --upgrade transformers==4.41.2 sentence-transformers==3.0.1 gensim==4.3.2 scikit-learn==1.5.0 accelerate==0.31.0 peft==0.11.1 scipy==1.10.1 numpy==1.26.4
In [2]:
import os

# Route Hugging Face downloads through a mirror (optional; useful when
# huggingface.co is slow or unreachable from your network)
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

Downloading and Running An LLM

The first step is to load our model onto the GPU for faster inference. Note that we load the model and tokenizer separately (and keep them that way) so we can explore each of them on its own.

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=False,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
Loading checkpoint shards: 100%|██████████| 2/2 [00:36<00:00, 18.39s/it]
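Since this chapter is all about tokens and their embeddings, it is worth peeking at the model's input embedding layer, which holds one vector per vocabulary id. A small sketch using the model and tokenizer loaded above (the exact sizes depend on the checkpoint):

In [ ]:
# The embedding matrix: one row per token in the vocabulary
embeddings = model.get_input_embeddings()
print(embeddings)      # prints Embedding(vocab_size, hidden_size)
print(len(tokenizer))  # the vocabulary size the tokenizer works with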
In [4]:
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|>"

# Tokenize the input prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

# Generate the text
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=20
)

# Print the output
print(tokenizer.decode(generation_output[0]))
Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|> Subject: Sincere Apologies for the Gardening Mishap


Dear
In [5]:
print(input_ids)
tensor([[14350,   385,  4876, 27746,  5281,   304, 19235,   363,   278, 25305,
           293, 16423,   292,   286,   728,   481, 29889, 12027,  7420,   920,
           372,  9559, 29889, 32001]], device='cuda:0')
In [6]:
for token_id in input_ids[0]:
    print(tokenizer.decode(token_id))
Write
an
email
apolog
izing
to
Sarah
for
the
trag
ic
garden
ing
m
ish
ap
.
Exp
lain
how
it
happened
.
<|assistant|>
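A related method, tokenizer.convert_ids_to_tokens, shows the raw vocabulary entries rather than decoded text, including the ▁ marker this SentencePiece-based tokenizer uses to encode a leading space:

In [ ]:
# Raw vocabulary entries for the same ids (note the ▁ whitespace markers)
print(tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))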
In [7]:
generation_output
Out[7]:
tensor([[14350,   385,  4876, 27746,  5281,   304, 19235,   363,   278, 25305,
           293, 16423,   292,   286,   728,   481, 29889, 12027,  7420,   920,
           372,  9559, 29889, 32001,  3323,   622, 29901,   317,  3742,   406,
          6225, 11763,   363,   278, 19906,   292,   341,   728,   481,    13,
            13,    13, 29928,   799]], device='cuda:0')
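Note that generation_output contains the prompt ids followed by the newly generated ids. To decode only the continuation, you can slice the prompt off, as in this small sketch:

In [ ]:
# Keep only the ids generated after the prompt
new_ids = generation_output[0][input_ids.shape[1]:]
print(tokenizer.decode(new_ids, skip_special_tokens=True))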
In [8]:
print(tokenizer.decode(3323))
print(tokenizer.decode(622))
print(tokenizer.decode([3323, 622]))
print(tokenizer.decode(29901))
Sub
ject
Subject
:

Comparing Trained LLM Tokenizers

In [5]:
from transformers import AutoTokenizer

colors_list = [
    '102;194;165', '252;141;98', '141;160;203',
    '231;138;195', '166;216;84', '255;217;47'
]

def show_tokens(sentence, tokenizer_name):
    """Print a sentence's tokens, each on its own background color."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    token_ids = tokenizer(sentence).input_ids
    for idx, t in enumerate(token_ids):
        print(
            # ANSI escape code that sets the background color
            f'\x1b[48;2;{colors_list[idx % len(colors_list)]}m' +
            tokenizer.decode(t) +
            '\x1b[0m',
            end=' '
        )
In [ ]:
text = """
English and CAPITALIZATION
🎵 鸟
show_tokens False None elif == >= else: two tabs:"    " Three tabs: "       "
12.0*50=600
"""
In [17]:
show_tokens(text, "bert-base-uncased")
[CLS] english and capital ##ization [UNK] [UNK] show _ token ##s false none eli ##f = = > = else : two tab ##s : " " three tab ##s : " " 12 . 0 * 50 = 600 [SEP] 
In [18]:
show_tokens(text, "bert-base-cased")
[CLS] English and CA ##PI ##TA ##L ##I ##Z ##AT ##ION [UNK] [UNK] show _ token ##s F ##als ##e None el ##if = = > = else : two ta ##bs : " " Three ta ##bs : " " 12 . 0 * 50 = 600 [SEP] 
In [19]:
show_tokens(text, "gpt2")

 English  and  CAP ITAL IZ ATION 
 � � �  � � � 
 show _ t ok ens  False  None  el if  ==  >=  else :  two  tabs :"        "  Three  tabs :  "              " 
 12 . 0 * 50 = 600 
 
In [20]:
show_tokens(text, "google/flan-t5-small")
English and CA PI TAL IZ ATION  <unk>  <unk> show _ to ken s Fal s e None  e l if = = > = else : two tab s : " " Three tab s : " " 12. 0 * 50 = 600  </s> 
In [21]:
# The official tokenizer is `tiktoken`; this is the same tokenizer hosted on the HF platform
show_tokens(text, "Xenova/gpt-4")

 English  and  CAPITAL IZATION 
 � � �  � � � 
 show _tokens  False  None  elif  ==  >=  else :  two  tabs :"      "  Three  tabs :  "         "
 12 . 0 * 50 = 600 
 
In [22]:
# You need to request access before being able to use this tokenizer
show_tokens(text, "bigcode/starcoder2-15b")

 English  and  CAPITAL IZATION 
 � � �   � � 
 show _ tokens  False  None  elif  ==  >=  else :  two  tabs :"      "  Three  tabs :  "         " 
 1 2 . 0 * 5 0 = 6 0 0 
 
In [23]:
show_tokens(text, "facebook/galactica-1.3b")

 English  and  CAP ITAL IZATION 
 � � � �  � � � 
 show _ tokens  False  None  elif   ==   > =  else :  two  t abs : "      "  Three  t abs :   "         " 
 1 2 . 0 * 5 0 = 6 0 0 
 
In [24]:
show_tokens(text, "microsoft/Phi-3-mini-4k-instruct")
 
 English and C AP IT AL IZ ATION 
 � � � �  � � � 
 show _ to kens False None elif == >= else : two tabs :"    " Three tabs : "       " 
 1 2 . 0 * 5 0 = 6 0 0 
 
In [7]:
# Translation: "How does Chinese get segmented into tokens?
# Let's try 'bert-base-chinese' and 'Qwen/Qwen2.5-1.5B'"
text_chinese = """
中文是咋样分词的?
试试"bert-base-chinese"和"Qwen/Qwen2.5-1.5B"
"""
In [8]:
show_tokens(text_chinese, "bert-base-chinese")
[CLS] 中 文 是 咋 样 分 词 的 ? 试 试 " be ##rt - base - chinese " 和 " [UNK] / [UNK] . 5 - 1 . [UNK] " [SEP] 
In [9]:
show_tokens(text_chinese, "Qwen/Qwen2.5-1.5B")

 中文 是 咋 样 分 词 的 ?
 试试 " bert -base -ch inese " 和 " Q wen /Q wen 2 . 5 - 1 . 5 B "
 
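One practical difference between these tokenizers is the size of their vocabularies, which you can read off directly. A quick comparison (this downloads each tokenizer):

In [ ]:
from transformers import AutoTokenizer

# len(tokenizer) counts the full vocabulary, including added special tokens
for name in ["bert-base-cased", "gpt2", "google/flan-t5-small",
             "microsoft/Phi-3-mini-4k-instruct"]:
    tok = AutoTokenizer.from_pretrained(name)
    print(f"{name}: {len(tok)} tokens")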

Contextualized Word Embeddings From a Language Model (Like BERT)

In [25]:
from transformers import AutoModel, AutoTokenizer

# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

# Load a language model
model = AutoModel.from_pretrained("microsoft/deberta-v3-xsmall")

# Tokenize the sentence
tokens = tokenizer('Hello world', return_tensors='pt')

# Process the tokens
output = model(**tokens)[0]
In [26]:
output.shape
Out[26]:
torch.Size([1, 4, 384])
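The shape reads as (batch, tokens, hidden size): one input sequence, four tokens (including [CLS] and [SEP], as the next cell shows), and a 384-dimensional vector per token. Each token's contextualized vector can be indexed out directly:

In [ ]:
# The vector for the second token ("Hello"); index 0 is [CLS]
hello_vector = output[0][1]
print(hello_vector.shape)  # torch.Size([384])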
In [27]:
for token in tokens['input_ids'][0]:
    print(tokenizer.decode(token))
[CLS]
Hello
 world
[SEP]
In [28]:
output
Out[28]:
tensor([[[-3.4816,  0.0861, -0.1819,  ..., -0.0612, -0.3911,  0.3017],
         [ 0.1898,  0.3208, -0.2315,  ...,  0.3714,  0.2478,  0.8048],
         [ 0.2071,  0.5036, -0.0485,  ...,  1.2175, -0.2292,  0.8582],
         [-3.4278,  0.0645, -0.1427,  ...,  0.0658, -0.4367,  0.3834]]],
       grad_fn=<NativeLayerNormBackward0>)
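These vectors are contextualized: the same token gets a different vector depending on the words around it. A minimal sketch, reusing the model and tokenizer above and assuming " bank" is a single token in this vocabulary:

In [ ]:
import torch

sent_a = tokenizer('I deposited cash at the bank', return_tensors='pt')
sent_b = tokenizer('I sat down on the river bank', return_tensors='pt')

with torch.no_grad():
    vecs_a = model(**sent_a)[0][0]  # (tokens, hidden)
    vecs_b = model(**sent_b)[0][0]

# Find where the " bank" token sits in each sentence
bank_id = tokenizer(' bank', add_special_tokens=False).input_ids[0]
idx_a = sent_a['input_ids'][0].tolist().index(bank_id)
idx_b = sent_b['input_ids'][0].tolist().index(bank_id)

# Same token id, but the two contextualized vectors are not identical
print(torch.nn.functional.cosine_similarity(vecs_a[idx_a], vecs_b[idx_b], dim=0))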

Text Embeddings (For Sentences and Whole Documents)

In [29]:
from sentence_transformers import SentenceTransformer

# Load model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Convert text to text embeddings
vector = model.encode("Best movie ever!")
In [30]:
vector.shape
Out[30]:
(768,)
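These sentence vectors can be compared directly; sentence-transformers ships a small utility for cosine similarity. A sketch reusing the model above (the example sentences are made up):

In [ ]:
from sentence_transformers import util

# Semantically close sentences should score higher than unrelated ones
embeddings = model.encode([
    "Best movie ever!",
    "I really enjoyed that film.",
    "The pasta was cold.",
])
print(util.cos_sim(embeddings[0], embeddings[1]))  # expected: higher
print(util.cos_sim(embeddings[0], embeddings[2]))  # expected: lower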

Word Embeddings Beyond LLMs

In [31]:
import gensim.downloader as api

# Download embeddings (66MB, GloVe, trained on Wikipedia, vector size: 50)
# Other options include "word2vec-google-news-300"
# More options at https://github.com/RaRe-Technologies/gensim-data
model = api.load("glove-wiki-gigaword-50")
[==================================================] 100.0% 66.0/66.0MB downloaded
In [32]:
model.most_similar([model['king']], topn=11)
Out[32]:
[('king', 1.0000001192092896),
 ('prince', 0.8236179351806641),
 ('queen', 0.7839043140411377),
 ('ii', 0.7746230363845825),
 ('emperor', 0.7736247777938843),
 ('son', 0.766719400882721),
 ('uncle', 0.7627150416374207),
 ('kingdom', 0.7542161345481873),
 ('throne', 0.7539914846420288),
 ('brother', 0.7492411136627197),
 ('ruler', 0.7434253692626953)]
In [33]:
model.most_similar([model['pig']], topn=11)
Out[33]:
[('pig', 0.9999998807907104),
 ('pigs', 0.8334351181983948),
 ('cow', 0.8290978074073792),
 ('rabbit', 0.8160701394081116),
 ('sheep', 0.7890334129333496),
 ('goat', 0.7866354584693909),
 ('meat', 0.7607064247131348),
 ('elephant', 0.7575558423995972),
 ('cows', 0.7575273513793945),
 ('dog', 0.7490062117576599),
 ('chickens', 0.7473998069763184)]
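These static word vectors also support the classic analogy arithmetic (king - man + woman ≈ queen), using gensim's positive/negative arguments:

In [ ]:
# king - man + woman: the top hit should be close to "queen"
model.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)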

Recommending Songs by Embeddings

In [34]:
import pandas as pd
from urllib import request

# Get the playlist dataset file
data = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt')

# Parse the playlist dataset file. Skip the first two lines as
# they only contain metadata
lines = data.read().decode("utf-8").split('\n')[2:]

# Remove playlists with only one song
playlists = [s.rstrip().split() for s in lines if len(s.split()) > 1]

# Load song metadata
songs_file = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt')
songs_file = songs_file.read().decode("utf-8").split('\n')
songs = [s.rstrip().split('\t') for s in songs_file]
songs_df = pd.DataFrame(data=songs, columns = ['id', 'title', 'artist'])
songs_df = songs_df.set_index('id')
In [35]:
print( 'Playlist #1:\n ', playlists[0], '\n')
print( 'Playlist #2:\n ', playlists[1])
Playlist #1:
  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '2', '42', '43', '44', '45', '46', '47', '48', '20', '49', '8', '50', '51', '52', '53', '54', '55', '56', '57', '25', '58', '59', '60', '61', '62', '3', '63', '64', '65', '66', '46', '47', '67', '2', '48', '68', '69', '70', '57', '50', '71', '72', '53', '73', '25', '74', '59', '20', '46', '75', '76', '77', '59', '20', '43'] 

Playlist #2:
  ['78', '79', '80', '3', '62', '81', '14', '82', '48', '83', '84', '17', '85', '86', '87', '88', '74', '89', '90', '91', '4', '73', '62', '92', '17', '53', '59', '93', '94', '51', '50', '27', '95', '48', '96', '97', '98', '99', '100', '57', '101', '102', '25', '103', '3', '104', '105', '106', '107', '47', '108', '109', '110', '111', '112', '113', '25', '63', '62', '114', '115', '84', '116', '117', '118', '119', '120', '121', '122', '123', '50', '70', '71', '124', '17', '85', '14', '82', '48', '125', '47', '46', '72', '53', '25', '73', '4', '126', '59', '74', '20', '43', '127', '128', '129', '13', '82', '48', '130', '131', '132', '133', '134', '135', '136', '137', '59', '46', '138', '43', '20', '139', '140', '73', '57', '70', '141', '3', '1', '74', '142', '143', '144', '145', '48', '13', '25', '146', '50', '147', '126', '59', '20', '148', '149', '150', '151', '152', '56', '153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '170', '171', '172', '173', '174', '175', '60', '176', '51', '177', '178', '179', '180', '181', '182', '183', '184', '185', '57', '186', '187', '188', '189', '190', '191', '46', '192', '193', '194', '195', '196', '197', '198', '25', '199', '200', '49', '201', '100', '202', '203', '204', '205', '206', '207', '32', '208', '209', '210']
In [36]:
from gensim.models import Word2Vec

# Train our Word2Vec model
model = Word2Vec(
    playlists, vector_size=32, window=20, negative=50, min_count=1, workers=4
)
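Here every song id is treated as a "word" and every playlist as a "sentence", so the trained model holds one 32-dimensional vector per song. A quick sanity check using gensim's KeyedVectors API:

In [ ]:
# Number of songs that received a vector, and the vector size we configured
print(len(model.wv))
print(model.wv[str(2172)].shape)  # (32,)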
In [37]:
song_id = 2172

# Ask the model for songs similar to song #2172
model.wv.most_similar(positive=str(song_id))
Out[37]:
[('3167', 0.9979863166809082),
 ('2849', 0.9977035522460938),
 ('2640', 0.9962807893753052),
 ('2976', 0.9962215423583984),
 ('6626', 0.9958192706108093),
 ('10084', 0.9958130717277527),
 ('5634', 0.9956934452056885),
 ('6658', 0.9956392645835876),
 ('1922', 0.9955645799636841),
 ('5549', 0.995154082775116)]
In [39]:
print(songs_df.iloc[2172])
title     Fade To Black
artist        Metallica
Name: 2172, dtype: object
In [40]:
import numpy as np

def print_recommendations(song_id):
    # Take the ids (first column) of the five most similar songs
    similar_songs = np.array(
        model.wv.most_similar(positive=str(song_id), topn=5)
    )[:, 0]
    # songs_df is indexed by string song ids, so look the rows up with .loc
    return songs_df.loc[similar_songs]

# Extract recommendations
print_recommendations(2172)
Out[40]:
      title             artist
id
3167  Unchained         Van Halen
2849  Run To The Hills  Iron Maiden
2640  Red Barchetta     Rush
2976  I Don't Know      Ozzy Osbourne
6626  Blackout          Scorpions
In [42]:
print_recommendations(842)
Out[42]:
       title                                             artist
id
27081  Give Me Everything (w/ Ne-Yo, Afrojack & Nayer)   Pitbull
886    Heartless                                         Kanye West
1418   Tick Tock                                         Kesha
330    Hate It Or Love It (w/ 50 Cent)                   The Game
413    If I Ruled The World (Imagine That) (w/ Laury...  Nas
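most_similar also accepts several positive examples at once, averaging their vectors, so you can steer recommendations with more than one seed song. A small sketch using ids from the cells above:

In [ ]:
# Recommend songs close to the average of two seed songs
for sim_id, score in model.wv.most_similar(positive=[str(2172), str(2849)], topn=5):
    print(songs_df.loc[sim_id, 'title'], '-', songs_df.loc[sim_id, 'artist'])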