
Showing posts with label token.

7/04/2023

CrossEntropyLoss example code with input shaped like NLP token logits.

Refer to the code below:

.

import torch
import torch.nn as nn

# Assume a batch size of 2 and a sequence length of 3, and the model's vocabulary size is 5.
# So, your predicted logits would have a shape of (batch size, sequence length, vocab size)

logits = torch.tensor([
[[0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1], [0.1, 0.2, 0.3, 0.4, 0.5]],
[[0.5, 0.4, 0.3, 0.2, 0.1], [0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1]]
])
logits = logits.view(-1, logits.shape[-1]) # Reshape logits to be 2D (N, C), where N is batch_size*seq_length, C is vocab_size

# Similarly, your labels would have a shape of (batch size, sequence length).
# These are example labels.

labels = torch.tensor([
[0, 1, 2],
[2, 1, 0]
])
labels = labels.view(-1) # Reshape labels to be 1D (N)

loss_function = nn.CrossEntropyLoss() # Initialize loss function
loss = loss_function(logits, labels) # Compute the loss

print(loss) # Print the loss

..




In this example, logits and labels are explicitly defined tensors. The values in logits represent the model's output for each token in the sequence, for each example in the batch, and the labels tensor holds the correct class for each of those tokens. The logits are flattened to shape (N, C) and the labels to shape (N), where N = batch_size * seq_length, so that nn.CrossEntropyLoss() can compare every token position against its target class and compute the loss between the predicted logits and the actual labels.
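
In real training the label tensor usually contains padded positions that should not contribute to the loss. Below is a minimal sketch of how that is commonly handled with the ignore_index argument; the shapes and the -100 marker are assumptions for illustration (-100 also happens to be the default ignore_index).

.

import torch
import torch.nn as nn

# Same layout as above, but some label positions are padding.
# Padded positions are marked with -100 so CrossEntropyLoss skips them.
logits = torch.randn(2, 3, 5)                 # (batch, seq_len, vocab)
labels = torch.tensor([[0, 1, -100],
                       [2, -100, -100]])      # -100 = ignored position

loss_function = nn.CrossEntropyLoss(ignore_index=-100)
loss = loss_function(logits.view(-1, logits.shape[-1]), labels.view(-1))

print(loss)  # averaged only over the non-ignored positions

..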




Thank you.

πŸ™‡πŸ»‍♂️

Tokenizer example source code using "BertWordPieceTokenizer" and "sentencepiece"


BertWordPieceTokenizer training code

.

import os
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False)

corpus_file = ['./ratings.txt']  # data path
vocab_size = 32000       # vocabulary size; 32,000 is commonly considered a good choice
limit_alphabet = 6000    # limit on the number of initial tokens kept before merges are performed
output_path = 'hugging_%d' % (vocab_size)
min_frequency = 5        # minimum frequency for a word to be kept
hf_model_path = './'

tokenizer.train(files=corpus_file,
                vocab_size=vocab_size,
                min_frequency=min_frequency,
                limit_alphabet=limit_alphabet,
                show_progress=True)

tokenizer.save_model(hf_model_path)

..


BertWordPiece Tokenizer test

.

from transformers import BertTokenizerFast

hf_model_path = './'
tokenizer = BertTokenizerFast.from_pretrained(hf_model_path, strip_accents=False,
lowercase = False)

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"
tokenized_input_for_pytorch = tokenizer(text, return_tensors='pt')

print("Tokens (str) : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int) : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))

# Tokens (str) : ['[CLS]', '넀이버', 'μ˜ν™”', '평가', 'λ¬Έ', '##μž₯', '##으둜', 'ν† ', '##크', '##λ‚˜μ΄', '##μ €', '[SEP]']
# Tokens (int) : [2, 6818, 5834, 6947, 1528, 3340, 5842, 2899, 3390, 8801, 3755, 3]
# Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

..
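
To sanity-check the round trip, the token ids can be decoded back to a string with the same tokenizer. A short sketch, assuming the tokenized_input_for_pytorch object from the test above:

.

ids = tokenized_input_for_pytorch['input_ids'].tolist()[0]

# skip_special_tokens drops [CLS] and [SEP]; WordPiece '##' pieces are re-joined
decoded = tokenizer.decode(ids, skip_special_tokens=True)
print("Decoded :", decoded)

..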


SentencePiece Tokenizer

.

import sentencepiece as spm
import os

input_file = './ratings.txt'
vocab_size = 32000

sp_model_root = 'sentencepiece'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
sp_model_name = 'tokenizer_%d' % (vocab_size)
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'unigram' #unigram, bpe
character_coverage = 1.0 #default=0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'

input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument%(input_file, sp_model_path, vocab_size,user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)

..


SentencePiece Tokenizer test

.

import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('./sentencepiece/tokenizer_32000'))

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"
tokens = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)

print("Tokens (str) : {}".format(tokens))
print("Tokens (int) : {}".format(ids))

# Tokens (str) : ['▁넀이버', '▁μ˜ν™”', '▁평가', '▁λ¬Έ', 'μž₯', '으둜', '▁', '토크', 'λ‚˜μ΄', 'μ €']
# Tokens (int) : [1209, 126, 2353, 3552, 412, 166, 123, 22627, 6361, 725]


..
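
The SentencePiece output can also be decoded back to text to confirm the tokenization is reversible. A brief sketch using the same sp processor, tokens, and ids from the test above:

.

# Round-trip check: decode the pieces and the ids back to the original text
print("Detok (pieces):", sp.decode_pieces(tokens))
print("Detok (ids)   :", sp.decode_ids(ids))

..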


Thank you.

πŸ™‡πŸ»‍♂️


5/23/2023

Simple code for creating a custom tokenizer.

In the sample code, the vocabulary is the characters "0,1,2,3,4" (plus special tokens) and the maximum length is 20.


.

from typing import List, Union

class CustomTokenizer:
    def __init__(self, vocab: Union[str, List[str]], pad_token="<PAD>", cls_token="<BOS>", sep_token="<SEP>", max_len=20):
        if isinstance(vocab, str):
            with open(vocab, 'r') as f:
                self.vocab = {word.strip(): i for i, word in enumerate(f.readlines())}
        elif isinstance(vocab, list):
            self.vocab = {word: i for i, word in enumerate(vocab)}
        else:
            raise ValueError("vocab must be either a filepath (str) or a list of words")
        print('vocab: ', self.vocab)
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.max_len = max_len
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def tokenize(self, text: str):
        tokens = [c for c in text if c in self.vocab]
        tokens = tokens[:self.max_len]
        padding_length = self.max_len - len(tokens)
        return [self.cls_token] + tokens + [self.sep_token] + [self.pad_token] * padding_length

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(token, self.vocab.get(self.pad_token)) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab.get(id, self.pad_token) for id in ids]


vocab = ["<PAD>", "<BOS>", "<SEP>", "0", "1", "2", "3", "4"]
with open('vocab.txt', 'w') as f:
    for token in vocab:
        f.write(token + '\n')

# Initialize your custom tokenizer
tokenizer = CustomTokenizer(vocab='vocab.txt')

# Now you can use this tokenizer to tokenize your data, study.marearts.com
tokenized_text = tokenizer.tokenize('22342')
print("tokenized_text: ", tokenized_text)

# Convert tokens to ids
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print("token_ids: ", token_ids)

# Convert ids back to tokens, marearts.com
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("tokens: ", tokens)

..
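
If you also need to recover the plain string, a small helper can drop the special tokens and join what remains. The decode_ids function below is a hypothetical addition for illustration, not part of the class above:

.

def decode_ids(tokenizer, ids):
    # hypothetical helper: convert ids to tokens, drop special tokens, join the characters
    specials = {tokenizer.pad_token, tokenizer.cls_token, tokenizer.sep_token}
    tokens = tokenizer.convert_ids_to_tokens(ids)
    return ''.join(t for t in tokens if t not in specials)

print("decoded: ", decode_ids(tokenizer, token_ids))  # expected: '22342'

..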


Thank you.

πŸ™‡πŸ»‍♂️


2/07/2023

Tokenizer token grouping, token entity grouping

Refer to the example code below:


..

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


words = ["on", "a", "reported", "basis", "and", "10.4%", "on", "a", "like-for-like", "basis."]
sentence = ' '.join(words)

tokens = tokenizer(sentence, return_tensors="np", max_length=128, padding='max_length')

print('origin tokenizer result -------')
print(f'words({len(words)}), {words}')
print(f'sentence: {sentence}')
print(f'tokens ({ len(tokens["input_ids"][0]) }) : {tokens}')

print('grouping tokens -------')
word_tokens_list = {x : tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in words}
print('word_tokens_list: ', word_tokens_list)

idx_for_words =[tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in words]
print(f'idx_for_words ({len(idx_for_words)}): ',idx_for_words)

desired_output = []
idx = 0
for token in idx_for_words:
    tokenoutput = []
    for ids in token:
        tokenoutput.append(idx)
        idx += 1
    desired_output.append(tokenoutput)

print('tokens in grouped list')
print(desired_output)

..



output

..

origin tokenizer result -------
words(10), ['on', 'a', 'reported', 'basis', 'and', '10.4%', 'on', 'a', 'like-for-like', 'basis.']
sentence: on a reported basis and 10.4% on a like-for-like basis.
tokens (128)
input_ids: [[ 101 1113 170 2103 3142 1105 1275 119 125 110 1113 170 1176 118
1111 118 1176 3142 119 102 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0]]
token_type_ids: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
attention_mask: [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
grouping tokens -------
word_tokens_list: {'on': [1113], 'a': [170], 'reported': [2103], 'basis': [3142], 'and': [1105], '10.4%': [1275, 119, 125, 110], 'like-for-like': [1176, 118, 1111, 118, 1176], 'basis.': [3142, 119]}
idx_for_words (10): [[1113], [170], [2103], [3142], [1105], [1275, 119, 125, 110], [1113], [170], [1176, 118, 1111, 118, 1176], [3142, 119]]
tokens in grouped list
[[0], [1], [2], [3], [4], [5, 6, 7, 8], [9], [10], [11, 12, 13, 14, 15], [16, 17]]

..
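
With a fast tokenizer, the same word-to-token grouping can also be read off directly from word_ids(). A minimal sketch, assuming the pre-split words list from above is passed with is_split_into_words=True:

..

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
words = ["on", "a", "reported", "basis", "and", "10.4%", "on", "a", "like-for-like", "basis."]

# Each word is tokenized in place; word_ids() maps every token back to its source word
encoding = tokenizer(words, is_split_into_words=True)
print(encoding.word_ids())
# expected to look like:
# [None, 0, 1, 2, 3, 4, 5, 5, 5, 5, 6, 7, 8, 8, 8, 8, 8, 9, 9, None]
# None marks the special tokens; equal indices mark tokens coming from the same word

..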


Thank you.

πŸ™‡πŸ»‍♂️

www.marearts.com

6/19/2017

Tip: tokenizing a CString in MFC

Refer to the code below.


CString selectedModel;
m_ListBoxOfConnection.GetText(sel, selectedModel);
//selectedModel = "com1,model1,base";

CString comStr;
CString modelStr;
CString optionStr;

AfxExtractSubString(comStr, selectedModel, 0, ',');
AfxExtractSubString(modelStr, selectedModel, 1, ',');
AfxExtractSubString(optionStr, selectedModel, 2, ',');


//then
//comStr = com1
//modelStr = model1
//optionStr = base


Thank you.