tf.train.Example

Example 使用案例

TensorFlow 提供的 tf.placeholder(配合 feed_dict 喂数据)这种读取方式不便于分布式训练。

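TensorFlow 提供了一个标准的读写格式和存储协议;不仅如此,TensorFlow 也提供了基于多线程队列的读取方式,高效而简洁,读取速度也更快。这就是基于 protobuf(Protocol Buffer)的 TFRecord 格式:文件中的每条记录都是一个序列化后的 tf.train.Example(或 tf.train.SequenceExample)消息。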

比较好的博客参考

tensorflow高级读写教程

比较好的 write 使用方法如下:

class TFRecordsGenerator(object):
    """Serialise generated samples to a TFRecords file.

    Sample production and mode handling are delegated to an
    ``EncoderGenerator``; this class converts each produced sample into a
    ``tf.train.SequenceExample`` and writes it to disk.
    """

    def __init__(self):
        self._generator = EncoderGenerator()

    def set_gmonly_mode(self):
        """Forward to the wrapped generator's ``set_gmonly_mode``."""
        self._generator.set_gmonly_mode()

    def set_allspans_mode(self):
        """Forward to the wrapped generator's ``set_allspans_mode``."""
        self._generator.set_allspans_mode()

    def is_gmonly_mode(self):
        """Return the wrapped generator's ``is_gmonly_mode()`` result."""
        return self._generator.is_gmonly_mode()

    def is_allspans_mode(self):
        """Return the wrapped generator's ``is_allspans_mode()`` result."""
        return self._generator.is_allspans_mode()

    @staticmethod
    def _to_sequence_example(sample):
        """Build a ``tf.train.SequenceExample`` from one sample.

        Scalar attributes (``chunk_id``, ``words_len``, ``spans_len``,
        ``ground_truth_len``) go into the context; sequence attributes go
        into the feature lists. ``begin_gm`` / ``end_gm`` are added only when
        ``sample`` is a ``SampleEncoded`` instance.

        Args:
            sample: object exposing the attributes read below.

        Returns:
            The populated ``tf.train.SequenceExample``.
        """

        def _bytes_feature(value):
            """value is a single bytes object."""
            return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

        # The next two create a simple feature: the first wraps one integer,
        # the second wraps a list of integers as a single feature.
        def _int64_feature(value):
            """value is a simple integer."""
            return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

        def _int64list_feature(value):
            """value is a list of integers."""
            return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

        def _floatlist_feature(value):
            """value is a list of floats."""
            return tf.train.Feature(float_list=tf.train.FloatList(value=value))

        def _int64_feature_list(values):
            """values is a list of integers like the words (words = [2,4,6,8,10]):
            a feature list where each feature holds exactly one number."""
            return tf.train.FeatureList(feature=[_int64_feature(v) for v in values])

        def _int64list_feature_list(values):
            """Like chars = [[1,2,3], [4,5], [6], [7,8], [9,10,11,12]]: a feature
            list where each feature can have a variable number of elements."""
            return tf.train.FeatureList(feature=[_int64list_feature(v) for v in values])

        def _floatlist_feature_list(values):
            """Like scores = [[0.1,0.2,0.3], [0.4,0.5]]: a feature list where
            each feature can have a variable number of float elements."""
            return tf.train.FeatureList(feature=[_floatlist_feature(v) for v in values])

        context = tf.train.Features(feature={
            "chunk_id": _bytes_feature(sample.chunk_id.encode('utf-8')),
            "words_len": _int64_feature(sample.words_len),
            "spans_len": _int64_feature(sample.spans_len),
            "ground_truth_len": _int64_feature(sample.ground_truth_len)
        })
        feature_list = {
            "words": _int64_feature_list(sample.words),
            "chars": _int64list_feature_list(sample.chars),
            "chars_len": _int64_feature_list(sample.chars_len),
            "begin_span": _int64_feature_list(sample.begin_spans),
            "end_span": _int64_feature_list(sample.end_spans),
            "cand_entities": _int64list_feature_list(sample.cand_entities),
            "cand_entities_scores": _floatlist_feature_list(sample.cand_entities_scores),
            "cand_entities_labels": _int64list_feature_list(sample.cand_entities_labels),
            "cand_entities_len": _int64_feature_list(sample.cand_entities_len),
            "ground_truth": _int64_feature_list(sample.ground_truth)
        }
        if isinstance(sample, SampleEncoded):
            feature_list["begin_gm"] = _int64_feature_list(sample.begin_gm)
            feature_list["end_gm"] = _int64_feature_list(sample.end_gm)
        feature_lists = tf.train.FeatureLists(feature_list=feature_list)

        return tf.train.SequenceExample(context=context, feature_lists=feature_lists)

    def process(self, filepath):
        """Write every sample generated from ``filepath`` to a TFRecords file.

        The output goes to
        ``config.base_folder + "data/tfrecords/" + args.experiment_name + "/"``
        followed by ``"gmonly/"`` or ``"allspans/"`` (depending on the current
        mode) and the input's base name with its last four characters removed.
        The folder is created if it does not exist. Progress and the final
        record count are printed.

        Args:
            filepath: path of the input dataset file.
        """
        print("processing file: ", filepath)
        # The dataset name is the last path component without its 4-character
        # suffix (the original intent is to drop '.txt').
        filename = os.path.basename(os.path.normpath(filepath))[:-4]
        output_folder = config.base_folder+"data/tfrecords/"+args.experiment_name+"/"
        output_folder += "gmonly/" if self.is_gmonly_mode() else "allspans/"
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        records_cnt = 0
        # The context manager closes (and flushes) the writer even if sample
        # generation or serialisation raises part-way through the file.
        with tf.python_io.TFRecordWriter(output_folder+filename) as writer:
            for sample in self._generator.process(filepath):
                sequence_example = self._to_sequence_example(sample)
                # Defensive guard kept from the original; only non-None
                # examples are written and counted.
                if sequence_example is not None:
                    writer.write(sequence_example.SerializeToString())
                    records_cnt += 1
        print("records_cnt = ", records_cnt)

ProtoBuf

这是一份很有诚意的 Protocol Buffer 语法详解