
Showing posts with label token.

7/04/2023

CrossEntropyLoss example code with input shaped like NLP token logits.

Refer to the code below:

.

import torch
import torch.nn as nn

# Assume a batch size of 2 and a sequence length of 3, and the model's vocabulary size is 5.
# So, your predicted logits would have a shape of (batch size, sequence length, vocab size)

logits = torch.tensor([
[[0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1], [0.1, 0.2, 0.3, 0.4, 0.5]],
[[0.5, 0.4, 0.3, 0.2, 0.1], [0.1, 0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2, 0.1]]
])
logits = logits.view(-1, logits.shape[-1]) # Reshape logits to be 2D (N, C), where N is batch_size*seq_length, C is vocab_size

# Similarly, your labels would have a shape of (batch size, sequence length).
# These are example labels.

labels = torch.tensor([
[0, 1, 2],
[2, 1, 0]
])
labels = labels.view(-1) # Reshape labels to be 1D (N)

loss_function = nn.CrossEntropyLoss() # Initialize loss function
loss = loss_function(logits, labels) # Compute the loss

print(loss) # Print the loss

..




In this example, logits and labels are explicitly defined tensors. The values in logits represent the model's output for each token in the sequence, for each example in the batch, and the labels tensor holds the correct class for each of those tokens. The logits are flattened to shape (N, C) and the labels to shape (N), where N = batch_size * seq_length, so that nn.CrossEntropyLoss() can compare every token position against its target class and compute the loss between the predicted logits and the actual labels.
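
In real training the label tensor usually contains padded positions that should not contribute to the loss. Below is a minimal sketch of how that is commonly handled with the ignore_index argument; the shapes and the -100 marker are assumptions for illustration (-100 also happens to be the default ignore_index).

.

import torch
import torch.nn as nn

# Same layout as above, but some label positions are padding.
# Padded positions are marked with -100 so CrossEntropyLoss skips them.
logits = torch.randn(2, 3, 5)                 # (batch, seq_len, vocab)
labels = torch.tensor([[0, 1, -100],
                       [2, -100, -100]])      # -100 = ignored position

loss_function = nn.CrossEntropyLoss(ignore_index=-100)
loss = loss_function(logits.view(-1, logits.shape[-1]), labels.view(-1))

print(loss)  # averaged only over the non-ignored positions

..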




Thank you.

πŸ™‡πŸ»‍♂️

Tokenizer example source code using "BertWordPieceTokenizer" and "sentencepiece"


BertWordPieceTokenizer training code

.

import os
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False)

corpus_file = ['./ratings.txt']  # data path
vocab_size = 32000       # vocabulary size; 32,000 is commonly considered a good choice
limit_alphabet = 6000    # limit on the number of initial tokens kept before merges are performed
output_path = 'hugging_%d' % (vocab_size)
min_frequency = 5        # minimum frequency for a word to be kept
hf_model_path = './'

tokenizer.train(files=corpus_file,
                vocab_size=vocab_size,
                min_frequency=min_frequency,
                limit_alphabet=limit_alphabet,
                show_progress=True)

tokenizer.save_model(hf_model_path)

..


BertWordPiece Tokenizer test

.

from transformers import BertTokenizerFast

hf_model_path = './'
tokenizer = BertTokenizerFast.from_pretrained(hf_model_path, strip_accents=False,
lowercase = False)

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"
tokenized_input_for_pytorch = tokenizer(text, return_tensors='pt')

print("Tokens (str) : {}".format([tokenizer.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int) : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))

# Tokens (str) : ['[CLS]', '넀이버', 'μ˜ν™”', '평가', 'λ¬Έ', '##μž₯', '##으둜', 'ν† ', '##크', '##λ‚˜μ΄', '##μ €', '[SEP]']
# Tokens (int) : [2, 6818, 5834, 6947, 1528, 3340, 5842, 2899, 3390, 8801, 3755, 3]
# Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

..
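
To sanity-check the round trip, the token ids can be decoded back to a string with the same tokenizer. A short sketch, assuming the tokenized_input_for_pytorch object from the test above:

.

ids = tokenized_input_for_pytorch['input_ids'].tolist()[0]

# skip_special_tokens drops [CLS] and [SEP]; WordPiece '##' pieces are re-joined
decoded = tokenizer.decode(ids, skip_special_tokens=True)
print("Decoded :", decoded)

..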


SentencePiece Tokenizer

.

import sentencepiece as spm
import os

input_file = './ratings.txt'
vocab_size = 32000

sp_model_root = 'sentencepiece'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
sp_model_name = 'tokenizer_%d' % (vocab_size)
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'unigram' #unigram, bpe
character_coverage = 1.0 #default=0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'

input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument%(input_file, sp_model_path, vocab_size,user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)

..


SentencePiece Tokenizer test

.

import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('./sentencepiece/tokenizer_32000'))

text = "넀이버 μ˜ν™” 평가 λ¬Έμž₯으둜 ν† ν¬λ‚˜μ΄μ €"
tokens = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)

print("Tokens (str) : {}".format(tokens))
print("Tokens (int) : {}".format(ids))

# Tokens (str) : ['▁넀이버', '▁μ˜ν™”', '▁평가', '▁λ¬Έ', 'μž₯', '으둜', '▁', '토크', 'λ‚˜μ΄', 'μ €']
# Tokens (int) : [1209, 126, 2353, 3552, 412, 166, 123, 22627, 6361, 725]


..
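
The SentencePiece output can also be decoded back to text to confirm the tokenization is reversible. A brief sketch using the same sp processor, tokens, and ids from the test above:

.

# Round-trip check: decode the pieces and the ids back to the original text
print("Detok (pieces):", sp.decode_pieces(tokens))
print("Detok (ids)   :", sp.decode_ids(ids))

..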


Thank you.

πŸ™‡πŸ»‍♂️


5/23/2023

Simple code for creating a custom tokenizer.

In the sample code, the vocabulary is the characters "0,1,2,3,4" (plus special tokens) and the maximum length is 20.


.

from typing import List, Union

class CustomTokenizer:
    def __init__(self, vocab: Union[str, List[str]], pad_token="<PAD>", cls_token="<BOS>", sep_token="<SEP>", max_len=20):
        if isinstance(vocab, str):
            with open(vocab, 'r') as f:
                self.vocab = {word.strip(): i for i, word in enumerate(f.readlines())}
        elif isinstance(vocab, list):
            self.vocab = {word: i for i, word in enumerate(vocab)}
        else:
            raise ValueError("vocab must be either a filepath (str) or a list of words")
        print('vocab: ', self.vocab)
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.max_len = max_len
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def tokenize(self, text: str):
        tokens = [c for c in text if c in self.vocab]
        tokens = tokens[:self.max_len]
        padding_length = self.max_len - len(tokens)
        return [self.cls_token] + tokens + [self.sep_token] + [self.pad_token] * padding_length

    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.get(token, self.vocab.get(self.pad_token)) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab.get(id, self.pad_token) for id in ids]


vocab = ["<PAD>", "<BOS>", "<SEP>", "0", "1", "2", "3", "4"]
with open('vocab.txt', 'w') as f:
    for token in vocab:
        f.write(token + '\n')

# Initialize your custom tokenizer
tokenizer = CustomTokenizer(vocab='vocab.txt')

# Now you can use this tokenizer to tokenize your data, study.marearts.com
tokenized_text = tokenizer.tokenize('22342')
print("tokenized_text: ", tokenized_text)

# Convert tokens to ids
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print("token_ids: ", token_ids)

# Convert ids back to tokens, marearts.com
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("tokens: ", tokens)

..
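
If you also need to recover the plain string, a small helper can drop the special tokens and join what remains. The decode_ids function below is a hypothetical addition for illustration, not part of the class above:

.

def decode_ids(tokenizer, ids):
    # hypothetical helper: convert ids to tokens, drop special tokens, join the characters
    specials = {tokenizer.pad_token, tokenizer.cls_token, tokenizer.sep_token}
    tokens = tokenizer.convert_ids_to_tokens(ids)
    return ''.join(t for t in tokens if t not in specials)

print("decoded: ", decode_ids(tokenizer, token_ids))  # expected: '22342'

..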


Thank you.

πŸ™‡πŸ»‍♂️


2/07/2023

Tokenizer token grouping, token entity grouping

Refer to the example code below:


..

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


words = ["on", "a", "reported", "basis", "and", "10.4%", "on", "a", "like-for-like", "basis."]
sentence = ' '.join(words)

tokens = tokenizer(sentence, return_tensors="np", max_length=128, padding='max_length')

print('origin tokenizer result -------')
print(f'words({len(words)}), {words}')
print(f'sentence: {sentence}')
print(f'tokens ({ len(tokens["input_ids"][0]) }) : {tokens}')

print('grouping tokens -------')
word_tokens_list = {x : tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in words}
print('word_tokens_list: ', word_tokens_list)

idx_for_words =[tokenizer.encode(x, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False) for x in words]
print(f'idx_for_words ({len(idx_for_words)}): ',idx_for_words)

desired_output = []
idx = 0
for token in idx_for_words:
    tokenoutput = []
    for ids in token:
        tokenoutput.append(idx)
        idx += 1
    desired_output.append(tokenoutput)

print('tokens in grouped list')
print(desired_output)

..



output

..

origin tokenizer result -------
words(10), ['on', 'a', 'reported', 'basis', 'and', '10.4%', 'on', 'a', 'like-for-like', 'basis.']
sentence: on a reported basis and 10.4% on a like-for-like basis.
tokens (128)
input_ids: [[ 101 1113 170 2103 3142 1105 1275 119 125 110 1113 170 1176 118
1111 118 1176 3142 119 102 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0]]
token_type_ids: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
attention_mask: [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
grouping tokens -------
word_tokens_list: {'on': [1113], 'a': [170], 'reported': [2103], 'basis': [3142], 'and': [1105], '10.4%': [1275, 119, 125, 110], 'like-for-like': [1176, 118, 1111, 118, 1176], 'basis.': [3142, 119]}
idx_for_words (10): [[1113], [170], [2103], [3142], [1105], [1275, 119, 125, 110], [1113], [170], [1176, 118, 1111, 118, 1176], [3142, 119]]
tokens in grouped list
[[0], [1], [2], [3], [4], [5, 6, 7, 8], [9], [10], [11, 12, 13, 14, 15], [16, 17]]

..
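
With a fast tokenizer, the same word-to-token grouping can also be read off directly from word_ids(). A minimal sketch, assuming the pre-split words list from above is passed with is_split_into_words=True:

..

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
words = ["on", "a", "reported", "basis", "and", "10.4%", "on", "a", "like-for-like", "basis."]

# Each word is tokenized in place; word_ids() maps every token back to its source word
encoding = tokenizer(words, is_split_into_words=True)
print(encoding.word_ids())
# expected to look like:
# [None, 0, 1, 2, 3, 4, 5, 5, 5, 5, 6, 7, 8, 8, 8, 8, 8, 9, 9, None]
# None marks the special tokens; equal indices mark tokens coming from the same word

..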


Thank you.

πŸ™‡πŸ»‍♂️

www.marearts.com

6/19/2017

Tip: tokenizing a CString in MFC

Refer to the code below.


CString selectedModel;
m_ListBoxOfConnection.GetText(sel, selectedModel);
//selectedModel = "com1,model1,base";

CString comStr;
CString modelStr;
CString optionStr;

AfxExtractSubString(comStr, selectedModel, 0, ',');
AfxExtractSubString(modelStr, selectedModel, 1, ',');
AfxExtractSubString(optionStr, selectedModel, 2, ',');


//then
//comStr = com1
//modelStr = model1
//optionStr = base


Thank you.