Commit 40718fe

Author: Guanheng Zhang
Commit message: checkpoint
1 parent 9e01aa7 commit 40718fe

File tree

1 file changed: +70 −68 lines

beginner_source/text_sentiment_ngrams_tutorial.py

Lines changed: 70 additions & 68 deletions
@@ -7,60 +7,63 @@
 - Access to the raw data as an iterator
 - Build data processing pipeline to convert the raw text strings into torch.Tensor that can be used to train the model
 - Shuffle and iterate the data with torch.utils.data.DataLoader
-
-Access to the raw dataset iterators
------------------------------------
-
-For some advanced users, they prefer to work on the raw data strings with their custom data process pipeline. The new torchtext library provides a few raw dataset iterators, which yield the raw text strings. For example, the AG_NEWS dataset iterators yield the raw data as a tuple of label and text.
-
 """
 
+######################################################################
+# Access to the raw dataset iterators
+# -----------------------------------
+#
+# Some advanced users prefer to work on the raw data strings with their custom data processing pipelines. The new torchtext library provides a few raw dataset iterators, which yield the raw text strings. For example, the AG_NEWS dataset iterators yield the raw data as a tuple of label and text.
+
 import torch
 # With torchtext 0.9.0 rc
 # from torchtext.datasets import AG_NEWS
 from torchtext.experimental.datasets.raw import AG_NEWS
 train_iter, = AG_NEWS(split=('train'))
 
-"""
-next(train_iter)
->>> (3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters -
-Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green
-again.")
-
-next(train_iter)
->>> (3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private
-investment firm Carlyle Group,\\which has a reputation for making well-timed
-and occasionally\\controversial plays in the defense industry, has quietly
-placed\\its bets on another part of the market.')
-
-next(train_iter)
->>> (3, "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring
-crude prices plus worries\\about the economy and the outlook for earnings are
-expected to\\hang over the stock market next week during the depth of
-the\\summer doldrums.")
-
-Prepare data processing pipelines
----------------------------------
-
-We have revisited the very basic components of the torchtext library, including vocab, word vectors, tokenizer backed by regular expression, and sentencepiece. Those are the basic data processing building blocks for raw text string.
-
-Here is an example for typical NLP data processing with tokenizer and vocabulary. The first step is to build a vocabulary with the raw training dataset. We provide a function build_vocab_from_iterator to build the vocabulary from a text iterator. Users can set up the minimum frequency for the tokens to be included.
-"""
+
+# next(train_iter)
+# >>> (3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters -
+# Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green
+# again.")
+#
+# next(train_iter)
+# >>> (3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private
+# investment firm Carlyle Group,\\which has a reputation for making well-timed
+# and occasionally\\controversial plays in the defense industry, has quietly
+# placed\\its bets on another part of the market.')
+#
+# next(train_iter)
+# >>> (3, "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring
+# crude prices plus worries\\about the economy and the outlook for earnings are
+# expected to\\hang over the stock market next week during the depth of
+# the\\summer doldrums.")
+
+
+######################################################################
+# Prepare data processing pipelines
+# ---------------------------------
+#
+# We have revisited the very basic components of the torchtext library, including the vocab, word vectors, the tokenizer backed by regular expressions, and sentencepiece. Those are the basic data processing building blocks for raw text strings.
+#
+# Here is an example of typical NLP data processing with a tokenizer and vocabulary. The first step is to build a vocabulary with the raw training dataset. We provide the function build_vocab_from_iterator to build the vocabulary from a text iterator. Users can set the minimum frequency for a token to be included.
+
 
 from torchtext.experimental.vocab import build_vocab_from_iterator
 from torchtext.experimental.transforms import basic_english_normalize
 tokenizer = basic_english_normalize()
 train_iter, = AG_NEWS(split=('train',))
 vocab = build_vocab_from_iterator(iter(tokenizer(line) for label, line in train_iter), min_freq=1)
 
-"""
-The vocabulary block converts a list of tokens into integers.
 
-vocab(['here', 'is', 'an', 'example'])
->>> [475, 21, 30, 5286]
+######################################################################
+# The vocabulary block converts a list of tokens into integers.
+#
+# vocab(['here', 'is', 'an', 'example'])
+# >>> [475, 21, 30, 5286]
+#
+# Prepare the data pipeline with the tokenizer and vocabulary. The pipelines will be used for the raw data strings from the dataset iterators.
 
-Prepare data pipeline with the tokenizer and vocabulary. The pipelines will be used for the raw data strings from the dataset iterators.
-"""
 
 def generate_text_pipeline(tokenizer, vocab):
     def _forward(text):
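The body of _forward is unchanged context that this hunk elides. For orientation, a minimal sketch of what such a pipeline factory plausibly looks like, assuming the body simply chains the tokenizer and the vocab lookup shown above; the file's actual implementation is not visible in this diff:

    def generate_text_pipeline(tokenizer, vocab):
        def _forward(text):
            # Tokenize the raw string, then map each token to its vocab index.
            return vocab(tokenizer(text))
        return _forward

With this shape, text_pipeline('here is an example') would return the integer ids shown in the comments above.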
@@ -69,23 +72,26 @@ def _forward(text):
 text_pipeline = generate_text_pipeline(basic_english_normalize(), vocab)
 label_pipeline = lambda x: int(x) - 1
 
-"""
-The text pipeline converts a text string into a list of integers based on the lookup defined in the vocab. The label pipeline converts the label into integers. For example,
 
-text_pipeline('here is the an example')
->>> [475, 21, 2, 30, 5286]
-label_pipeline('10')
->>> 9
-
-Generate data batch and iterator
---------------------------------
-
-The PyTorch data loading utility is the torch.utils.data.DataLoader class. It works with a map-style dataset that implements the getitem() and len() protocols, and represents a map from indices/keys to data samples. It also works with an iterable datasets with the shuffle argumnet of False.
+######################################################################
+# The text pipeline converts a text string into a list of integers based on the lookup defined in the vocab. The label pipeline converts the label into an integer. For example,
+#
+# text_pipeline('here is the an example')
+# >>> [475, 21, 2, 30, 5286]
+# label_pipeline('10')
+# >>> 9
+#
+######################################################################
+# Generate data batch and iterator
+# --------------------------------
+#
+# The PyTorch data loading utility is the torch.utils.data.DataLoader class. It works with a map-style dataset that implements the __getitem__() and __len__() protocols, and represents a map from indices/keys to data samples. It also works with an iterable dataset with the shuffle argument set to False.
+#
+# Before sending them to the model, the collate_fn function works on a batch of samples generated from DataLoader. The input to collate_fn is a batch of data with the batch size in DataLoader, and collate_fn processes them according to the data processing pipelines declared earlier. Pay attention here and make sure that collate_fn is declared as a top-level def. This ensures that the function is available in each worker.
+#
+# In this example, the text entries in the original data batch input are packed into a list and concatenated as a single tensor for the input of nn.EmbeddingBag. The offset is a tensor of delimiters representing the beginning index of each individual sequence in the text tensor. Label is a tensor saving the labels of the individual text entries.
 
-Before sending to the model, collate_fn function works on a batch of samples generated from DataLoader. The input to collat_fn is a batch of data with the batch size in DataLoader, and collate_fn processes them according to the data processing pipelines declared on Step 2. Pay attention here and make sure that collate_fn is declared as a top level def. This ensures that the function is available in each worker.
 
-In this example, the text entries in the original data batch input are packed into a list and concatenated as a single tensor for the input of nn.EmbeddingBag. The offset is a tensor of delimiters to represent the beginning index of the individual sequence in the text tensor. Label is a tensor saving the labels of indidividual text entries.
-"""
 from torch.utils.data import DataLoader
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -385,15 +391,13 @@ def predict(text, text_pipeline):
 print("This is a %s news" %ag_news_label[predict(ex_text_str, text_pipeline)])
 
 
-
-"""
-Other data processing pipeline - SentencePiece
-----------------------------------------------
-
-SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems where the vocabulary size is predetermined prior to the neural model training. For sentencepiece transforms in torchtext, both subword units (e.g., byte-pair-encoding (BPE) ) and unigram language model are supported. We provide a few pretrained SentencePiece models and they are accessable from PRETRAINED_SP_MODEL. Here is an example to apply SentencePiece transform to build the dataset.
-
-By using spm_transform transform in collate_batch function, you can re-run the tutorial with slightly improved results.
-"""
+##############################################
+# Other data processing pipeline - SentencePiece
+# ----------------------------------------------
+#
+# SentencePiece is an unsupervised text tokenizer and detokenizer mainly for neural-network-based text generation systems where the vocabulary size is predetermined prior to the neural model training. For the sentencepiece transforms in torchtext, both subword units (e.g., byte-pair encoding (BPE)) and the unigram language model are supported. We provide a few pretrained SentencePiece models, and they are accessible from PRETRAINED_SP_MODEL. Here is an example of applying the SentencePiece transform to build the dataset.
+#
+# By using the spm_transform in the collate_batch function, you can re-run the tutorial with slightly improved results.
 
 from torchtext.experimental.transforms import (
     PRETRAINED_SP_MODEL,
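The import statement continues in the next hunk, and the code that produces spm_filepath is elided between the two hunks. A hedged sketch of what that setup could look like, assuming torchtext's download_from_url helper and a hypothetical key into the PRETRAINED_SP_MODEL dict of pretrained model URLs:

    from torchtext.utils import download_from_url

    # 'text_unigram_15000' is an assumed key name; PRETRAINED_SP_MODEL maps
    # pretrained SentencePiece model names to download URLs per the prose above.
    spm_filepath = download_from_url(PRETRAINED_SP_MODEL['text_unigram_15000'])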
@@ -405,11 +409,9 @@ def predict(text, text_pipeline):
 spm_transform = sentencepiece_processor(spm_filepath)
 sp_model = load_sp_model(spm_filepath)
 
-"""
-The sentecepiece processor converts a text string into a list of integers. You can use the decode method to convert a list of integers back to the original string.
-
-spm_transform('here is the an example')
->>> [130, 46, 9, 76, 1798]
-spm_transform.decode([6468, 17151, 4024, 8246, 16887, 87, 23985, 12, 581, 15120])
->>> 'torchtext sentencepiece processor can encode and decode'
-"""
+# The sentencepiece processor converts a text string into a list of integers. You can use the decode method to convert a list of integers back to the original string.
+#
+# spm_transform('here is the an example')
+# >>> [130, 46, 9, 76, 1798]
+# spm_transform.decode([6468, 17151, 4024, 8246, 16887, 87, 23985, 12, 581, 15120])
+# >>> 'torchtext sentencepiece processor can encode and decode'
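To re-run the tutorial with SentencePiece as the note in the previous hunk suggests, only the encoding step of the collate function has to change, since spm_transform maps a raw string directly to subword ids with no separate vocab lookup. A hedged one-line swap, assuming the collate_batch sketched earlier:

    # Inside collate_batch, replace the tokenizer + vocab pipeline:
    # before: processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
    processed_text = torch.tensor(spm_transform(_text), dtype=torch.int64)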
