TensorFlow variable-length sequence storage: a worked example

Time: 2020-05-15

Problem

The problem is to store the following ragged array in a TFRecord file and read it back:


a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])

I've stored images in TFRecord files before, so this shouldn't be a big deal. Here's the first attempt, all in one go:


import tensorflow as tf
import numpy as np

def _int64_feature(value):
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# Write an array to TFrecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')

for i in range(a.shape[0]):
 feature = {'i' : _int64_feature(i), 
  'data': _int64_feature(a[i])}

 # Create an example protocol buffer
 example = tf.train.Example(features=tf.train.Features(feature=feature))

 # Serialize to string and write on the file
 writer.write(example.SerializeToString())

writer.close()


# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'i':tf.FixedLenFeature([],tf.int64),
   'data':tf.FixedLenFeature([],tf.int64)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return parsed_features['i'], parsed_features['data']

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
 print(sess.run([i, data]))
 print(sess.run([i, data]))
 print(sess.run([i, data]))

A strange error is reported: Name: <unknown>, Key: data, Index: 0. Number of int64 values != expected. Values size: 6 but output shape: []. In other words, my data contains 6 values, but the parser was told to read a scalar ([]). What went wrong? I first commented out the reading code to check whether the TFRecord had been written successfully; it had, so this is a reading problem. I suspected the varying length of each record was the cause, but then again, images of different sizes can still be read back fine. Then I remembered that image pipelines store pictures with img.tobytes(): the whole array is converted to one byte string before storing. Does TensorFlow treat that byte string as a single element? Even though every picture has a different size, after tobytes() each one becomes a single element, which is reshaped afterwards according to (height, width, channel).

So let me try saving the data as bytes instead of int64. Another round of operations:

Data to bytes

# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np

def _byte_feature(value):
 return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
# Write an array to TFrecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')

for i in range(a.shape[0]): # i = 0 ~ 4
 feature = {'len': _int64_feature(len(a[i])),  # changed the meaningless 'i' to 'len' so the data can be recovered later
  'data': _byte_feature(np.array(a[i]).tobytes())}  # a[i] is a list (more on that later); go through numpy to get bytes

 # Create an example protocol buffer
 example = tf.train.Example(features=tf.train.Features(feature=feature))

 # Serialize to string and write on the file
 writer.write(example.SerializeToString())

writer.close()

#
# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'len':tf.FixedLenFeature([],tf.int64),
   'data':tf.FixedLenFeature([],tf.string)}  # changed to string
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return parsed_features['len'], parsed_features['data']

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
 print(sess.run([i, data]))
 print(sess.run([i, data]))
 print(sess.run([i, data]))


"""
[array([6], dtype=int64), array([b'\x00\x00\x00\x006\x00\x00\x00[\x00\x00\x00\x99\x00\x00\x00\xb1\x00\x00\x00\x01\x00\x00\x00'],
 dtype=object)]
[array([5], dtype=int64), array([b'\x00\x00\x00\x002\x00\x00\x00Y\x00\x00\x00\x93\x00\x00\x00\xc4\x00\x00\x00'],
 dtype=object)]
[array([4], dtype=int64), array([b'\x00\x00\x00\x00&\x00\x00\x00O\x00\x00\x00\x9d\x00\x00\x00'],
 dtype=object)]
"""

Bytes data decoding

Now I get the desired output, but how do I decode the bytes?

Method 1: parse it ourselves


 a, b = sess.run([i, data])
 # the dtype must match the dtype of the array we called tobytes() on when writing
 c = np.frombuffer(b[0], dtype=np.int, count=a[0])
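
Putting that together, a minimal sketch of the whole read loop (assuming the writer above; again, the dtype passed to np.frombuffer must match the dtype of the array whose bytes were written):

with tf.Session() as sess:
    for _ in range(3):
        length, raw = sess.run([i, data])
        # raw is a batch of byte strings; decode the first one back into
        # integers, reading exactly length[0] values
        values = np.frombuffer(raw[0], dtype=np.int, count=length[0])
        print(length[0], values)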

Method 2: use TensorFlow's parsing ops

def _parse_function(example_proto):
 keys_to_features = {'len':tf.FixedLenFeature([],tf.int64),
   'data':tf.FixedLenFeature([],tf.string)}  # changed to string
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 dat = tf.decode_raw(parsed_features['data'], tf.int64)  # we stored the raw bytes of an int64 array, so decode them back as int64
 return parsed_features['len'], dat
"""
[array([6]), array([[ 0, 54, 91, 153, 177, 1]])]
[array([5]), array([[ 0, 50, 89, 147, 196]])]
[array([4]), array([[ 0, 38, 79, 157]])]
"""

You can see the result is a two-dimensional array because we used batched output: even though batch_size = 1, it is still output with a batch dimension. Let me modify a few things.


def _parse_function(example_proto):
 keys_to_features = {'len':tf.FixedLenFeature([1],tf.int64),
   'data':tf.FixedLenFeature([1],tf.string)} 
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 dat = tf.decode_raw(parsed_features['data'],tf.int64)
 return parsed_features['len'], dat

"""
[array([[6]]), array([[[ 0, 54, 91, 153, 177, 1]]])]
[array([[5]]), array([[[ 0, 50, 89, 147, 196]]])]
[array([[4]]), array([[[ 0, 38, 79, 157]]])]
"""

Now it's three-dimensional. Next, let's deliberately make it report an error:

def _parse_function(example_proto):
 keys_to_features = {'len':tf.FixedLenFeature([2],tf.int64),  # changed 1 to 2
   'data':tf.FixedLenFeature([1],tf.string)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return parsed_features['len'], parsed_features['data']

"""
InvalidArgumentError: Key: len. Can't parse serialized Example.
 [[Node: ParseSingleExample/ParseSingleExample = ParseSingleExample[Tdense=[DT_STRING, DT_INT64], dense_keys=["data", "len"], dense_shapes=[[1], [2]], num_sparse=0, sparse_keys=[], sparse_types=[]](arg0, ParseSingleExample/Const, ParseSingleExample/Const_1)]]
 [[Node: IteratorGetNext_22 = IteratorGetNext[output_shapes=[[?,2], [?,1]], output_types=[DT_INT64, DT_STRING], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator_22)]]
"""

You can see dense_keys=["data", "len"] and dense_shapes=[[1], [2]] in the error. tf.FixedLenFeature reads fixed-length data, and the shape you declare must match the number of values actually stored in each example: [] means a single scalar value, [1] a length-1 vector, [2] exactly two values, and so on. Since 'len' holds exactly one int64 per example, declaring [2] cannot be satisfied and parsing fails. (I was only guessing at this at first; the error message confirms it.)
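To make the shape rule concrete, here is a tiny sketch of the three possible declarations for the 'len' feature, which stores exactly one int64 value per example (TF 1.x assumed; the mismatch only surfaces at parse time):

import tensorflow as tf

tf.FixedLenFeature([], tf.int64)   # scalar: expects exactly one value -> OK
tf.FixedLenFeature([1], tf.int64)  # length-1 vector: exactly one value -> OK
tf.FixedLenFeature([2], tf.int64)  # declares two values -> fails when parsing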

TensorFlow variable-length array storage

Anyway, it can be read now. But a variable-length array of our own like this has to be parsed by hand every time, which is troublesome (I have been fooling around above). TensorFlow actually defines a parsing method for variable-length arrays, tf.VarLenFeature, so we don't need to convert the variable-length array to bytes and decode it ourselves. Another round of operations:


import tensorflow as tf
import numpy as np

def _int64_feature(value):
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# Write an array to TFrecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')

for i in range(a.shape[0]): # i = 0 ~ 4
 feature = {'i' : _int64_feature(i), 
  'data': _int64_feature(a[i])}

 # Create an example protocol buffer
 example = tf.train.Example(features=tf.train.Features(feature=feature))

 # Serialize to string and write on the file
 writer.write(example.SerializeToString())

writer.close()


# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'i':tf.FixedLenFeature([],tf.int64),
   'data':tf.VarLenFeature(tf.int64)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return parsed_features['i'], tf.sparse_tensor_to_dense(parsed_features['data'])

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
 print(sess.run([i, data]))
 print(sess.run([i, data]))
 print(sess.run([i, data]))

"""
[array([0], dtype=int64), array([[ 0, 54, 91, 153, 177, 1]], dtype=int64)]
[array([1], dtype=int64), array([[ 0, 50, 89, 147, 196]], dtype=int64)]
[array([2], dtype=int64), array([[ 0, 38, 79, 157]], dtype=int64)]
"""

Batch output

The output is still an array, hahaha. Now another operation: a bigger batch.


dataset = dataset.batch(2)
"""
Cannot batch tensors with different shapes in component 1. First element had shape [6] and element 1 had shape [5].
"""

This is because the shapes of the data within a batch must be consistent: the first element has length 6 and the second has length 5, so an error is raised. The fix is to pad them to the same length, but before that, let's test something else.


a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])


for i in range(a.shape[0]):
 print(type(a[i]))

"""
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
"""

You can see that each element of this ragged array is a Python list (at first I thought it would be some other kind of object). Now pad the rows by hand and check the element type again:


a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196,0],
  [0, 38, 79, 157,0,0],
  [0, 49, 89, 147, 177,0],
  [0, 32, 73, 145,0,0]])


for i in range(a.shape[0]):
 print(type(a[i]))

"""
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
"""

Now each element comes back as a numpy.ndarray. Why does that matter? Look at the helper again:


def _int64_feature(value):
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

tf.train.Int64List accepts a plain list or a numpy.ndarray directly, but a list that wraps a numpy.ndarray ([ndarray]) raises an error. The ragged array above returned lists, so nothing went wrong; with the padded array, writing now fails:


a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196,0],
  [0, 38, 79, 157,0,0],
  [0, 49, 89, 147, 177,0],
  [0, 32, 73, 145,0,0]])

"""
TypeError: only size-1 arrays can be converted to Python scalars
""" 

This is because each element is now a numpy.ndarray rather than a list. Inside _int64_feature, the isinstance(value, list) check fails, so the ndarray gets wrapped as [ndarray], and protobuf then tries to convert the whole array into a single scalar, which raises the error. One fix is to convert the ndarray to a list:


for i in range(a.shape[0]): # i = 0 ~ 4
 feature = {'i' : _int64_feature(i), 
  'data': _int64_feature(a[i].tolist())}
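
Alternatively, the helper itself can handle both cases. A small sketch (the ndarray branch is my own addition, not part of the original helper):

import numpy as np
import tensorflow as tf

def _int64_feature(value):
    # accept scalars, plain lists, and numpy arrays alike
    if isinstance(value, np.ndarray):
        value = value.tolist()
    elif not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))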

With that fixed, we can increase the batch size:


dataset = dataset.batch(2)

"""
[array([0, 2], dtype=int64), array([[ 0, 54, 91, 153, 177, 1],
 [ 0, 38, 79, 157, 0, 0]], dtype=int64)]
[array([1, 3], dtype=int64), array([[ 0, 50, 89, 147, 196, 0],
 [ 0, 49, 89, 147, 177, 0]], dtype=int64)]
[array([4, 0], dtype=int64), array([[ 0, 32, 73, 145, 0, 0],
 [ 0, 54, 91, 153, 177, 1]], dtype=int64)]
"""

Of course, TensorFlow doesn't make us pad by hand: it already provides a padding function, padded_batch.


# -*- coding: utf-8 -*-

import tensorflow as tf

def _int64_feature(value):
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

a = [[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]]

writer = tf.python_io.TFRecordWriter('file')

for v in a:
 feature = {'data': _int64_feature(v)}

 # Create an example protocol buffer
 example = tf.train.Example(features=tf.train.Features(feature=feature))

 # Serialize to string and write on the file
 writer.write(example.SerializeToString())

writer.close()


# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'data':tf.VarLenFeature(tf.int64)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return tf.sparse_tensor_to_dense( parsed_features['data'])

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.padded_batch(2,padded_shapes=([None]))
iterator = dataset.make_one_shot_iterator()
data = iterator.get_next()
with tf.Session() as sess:
 print(sess.run([data]))
 print(sess.run([data]))
 print(sess.run([data]))


"""
[array([[ 0, 54, 91, 153, 177, 1],
 [ 0, 50, 89, 147, 196, 0]])]
[array([[ 0, 38, 79, 157, 0],
 [ 0, 49, 89, 147, 177]])]
[array([[ 0, 32, 73, 145, 0, 0],
 [ 0, 54, 91, 153, 177, 1]])]
"""

You can see that the padding really is automatic.
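
By default padded_batch pads with zeros. If zero is a meaningful value in your sequences, you can pass an explicit fill value instead; a small sketch (the -1 fill is just an illustrative choice):

# pad each batch to the length of its longest element, filling with -1
# instead of the default 0 (the dtype must match the int64 data)
dataset = dataset.padded_batch(
    2,
    padded_shapes=[None],
    padding_values=tf.constant(-1, dtype=tf.int64))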

Picture batch

Let's test with picture data directly:


# -*- coding: utf-8 -*-

import tensorflow as tf
import matplotlib.pyplot as plt
def _byte_feature(value):
 return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

files = tf.gfile.Glob('*.jpeg')
writer = tf.python_io.TFRecordWriter('file')
for file in files:
    with tf.gfile.FastGFile(file, 'rb') as f:
        img_buff = f.read()
    feature = {'img': _byte_feature(tf.compat.as_bytes(img_buff))}
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())
writer.close()


filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'img':tf.FixedLenFeature([], tf.string)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 image = tf.image.decode_jpeg(parsed_features['img'])
 return image

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.batch(2)
iterator = dataset.make_one_shot_iterator()
image = iterator.get_next()

with tf.Session() as sess:
 img = sess.run([image])
 print(len(img))
 print(img[0].shape)
 plt.imshow(img[0][0])

"""
Cannot batch tensors with different shapes in component 0. First element had shape [440,440,3] and element 1 had shape [415,438,3].
"""

See? If the pictures in a batch have different sizes, they can't be batched: we must resize the pictures within a batch to the same size.

def _parse_function(example_proto):
 keys_to_features = {'img':tf.FixedLenFeature([], tf.string)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 image = tf.image.decode_jpeg(parsed_features['img'])
 # resize_images converts uint8 to float anyway, but plt.imshow can only display
 # uint8 or floats in [0, 1]; convert_image_dtype maps uint8 into [0, 1] (i.e. divides by 255.0)
 image = tf.image.convert_image_dtype(image, tf.float32)
 image = tf.image.resize_images(image,(224,224))
 return image

But sometimes we want to feed pictures of different sizes without resizing them, in which case we can only set batch_size = 1, since all image shapes within a batch must be the same. As a compromise for training, we can use the dynamic padding interface provided by TensorFlow to pad the images within a batch to the same shape:


dataset = dataset.padded_batch(2,padded_shapes=([None,None,3]))

What if we want to save the name of the picture as a label?

# -*- coding: utf-8 -*-

import tensorflow as tf
import matplotlib.pyplot as plt
import os

out_charset="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

def _byte_feature(value):
 return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(values):
    if not isinstance(values, list):
        values = [values]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

files = tf.gfile.Glob('*.jpg')
writer = tf.python_io.TFRecordWriter('file')
for file in files:
    with tf.gfile.FastGFile(file, 'rb') as f:
        img_buff = f.read()
    filename = os.path.basename(file).split('.')[0]
    label = list(map(lambda x: out_charset.index(x), filename))
    feature = {'label': _int64_feature(label),
               'filename': _byte_feature(tf.compat.as_bytes(filename)),
               'img': _byte_feature(tf.compat.as_bytes(img_buff))}
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())
writer.close()


filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {
  'label':tf.VarLenFeature(tf.int64),
  'filename':tf.FixedLenFeature([],tf.string),
  'img':tf.FixedLenFeature([], tf.string)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 label = tf.sparse_tensor_to_dense(parsed_features['label'])
 filename = parsed_features['filename']
 image = tf.image.decode_jpeg(parsed_features['img'])
 return image,label,filename

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.padded_batch(3,padded_shapes=([None,None,3],[None],[]))
# Each of the three returned tensors needs an entry in padded_shapes. The decoded
# image and the label are variable-length, so they are padded with None; filename is
# not decoded and is a single byte string, so it needs no padding ([]).
iterator = dataset.make_one_shot_iterator()
image,label,filename = iterator.get_next()

with tf.Session() as sess:
 print(label.eval())
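
As a quick usage check, the integer labels can be mapped back to text with the same out_charset. A small sketch of my own, not from the original post:

with tf.Session() as sess:
    lab = sess.run(label)  # shape [batch, max_len], zero-padded by padded_batch
    for row in lab:
        # caveat: index 0 also means 'A' in out_charset, so the zero padding is
        # ambiguous here; a real pipeline would reserve a dedicated padding index
        print(''.join(out_charset[idx] for idx in row))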

Blind trial

What if the data we write is itself a list of lists?


a = np.arange(16).reshape(2,4,2)

"""
TypeError: [0, 1] has type list, but expected one of: int, long
"""

But think about it: tf.train.Feature(int64_list=tf.train.Int64List(value=value)) stores a flat list of int64 values. What if we want to store word vectors? For example, one sentence is one sample, s1 = 'I love you'. With one-hot encoding, I = [0,0,1], love = [0,1,0], you = [1,0,0], so s1 = [[0,0,1], [0,1,0], [1,0,0]]. How do we store a sample like this?
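One workable approach, consistent with the tricks above, is to flatten the matrix, store its shape alongside it, and reshape after parsing. A minimal sketch of my own (reusing the _int64_feature helper defined above), not something from the original post:

import numpy as np
import tensorflow as tf

s1 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]])

# write: flatten the 2D sample and record its shape so it can be rebuilt
feature = {'shape': _int64_feature(list(s1.shape)),
           'data': _int64_feature(s1.flatten().tolist())}
example = tf.train.Example(features=tf.train.Features(feature=feature))

# read: parse both as variable-length, then restore the original shape
def _parse_function(example_proto):
    keys_to_features = {'shape': tf.VarLenFeature(tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed = tf.parse_single_example(example_proto, keys_to_features)
    shape = tf.sparse_tensor_to_dense(parsed['shape'])
    data = tf.sparse_tensor_to_dense(parsed['data'])
    return tf.reshape(data, shape)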

That's all for this example of storing variable-length sequences in TensorFlow. I hope it gives you a useful reference.