news_data.py

import pandas as pd
import tensorflow as tf

TRAIN_URL = "file:///c:/Users/Colin/Desktop/HackBU2018/fakenews_training_set.csv"
TEST_URL = "file:///c:/Users/Colin/Desktop/HackBU2018/fakenews_testing_set.csv"

CSV_COLUMN_NAMES = ['location_value', 'age_value', 'flesch_reading', 'flesch_kincaid',
                    'coleman_liau', 'typos_to_words',
                    'percent_difficult_words', 'google_search_similarity',
                    'fake']

TYPES = ['Real', 'Fake']


def maybe_download():
    """Fetch the train/test CSVs into the Keras cache and return their local paths."""
    train_path = tf.keras.utils.get_file(TRAIN_URL.split('/')[-1], TRAIN_URL)
    test_path = tf.keras.utils.get_file(TEST_URL.split('/')[-1], TEST_URL)

    return train_path, test_path


def load_data(label_name='fake'):
    """Return the dataset as (train_features, train_label), (test_features, test_label)."""
    train_path, test_path = maybe_download()

    train = pd.read_csv(train_path, names=CSV_COLUMN_NAMES, header=0)
    # pop() removes the label column in place, so `train_feature` (the same
    # DataFrame) ends up holding only the feature columns.
    train_feature, train_label = train, train.pop(label_name)

    test = pd.read_csv(test_path, names=CSV_COLUMN_NAMES, header=0)
    test_feature, test_label = test, test.pop(label_name)

    return (train_feature, train_label), (test_feature, test_label)


def train_input_fn(features, labels, batch_size):
    """An input function for training."""
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)

    # Return the dataset.
    return dataset


def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction."""
    features = dict(features)
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples.
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset
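

# Illustrative usage (not part of the original file): with labels=None,
# eval_input_fn yields a features-only Dataset, which is what an Estimator's
# predict() expects from its input_fn. `classifier` below is a hypothetical
# tf.estimator model defined elsewhere:
#
#   predictions = classifier.predict(
#       input_fn=lambda: eval_input_fn(test_feature, None, batch_size=32))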


# The remainder of this file contains a simple example of a csv parser,
# implemented using the `Dataset` class.

# `tf.decode_csv` sets the types of the outputs to match the examples given in
# the `record_defaults` argument: one default per CSV column, here eight float
# features followed by the integer 'fake' label.
CSV_TYPES = [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0]]


def _parse_line(line):
    # Decode the line into its fields.
    fields = tf.decode_csv(line, record_defaults=CSV_TYPES)

    # Pack the result into a dictionary.
    features = dict(zip(CSV_COLUMN_NAMES, fields))

    # Separate the label from the features.
    label = features.pop('fake')

    return features, label


def csv_input_fn(csv_path, batch_size):
    # Create a dataset containing the text lines, skipping the header row.
    dataset = tf.data.TextLineDataset(csv_path).skip(1)

    # Parse each line.
    dataset = dataset.map(_parse_line)

    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)

    # Return the dataset.
    return dataset
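

# A minimal end-to-end sketch tying the pieces together. The DNNClassifier
# configuration below (hidden_units, steps, batch_size) is an illustrative
# assumption, not something defined elsewhere in this repository; csv_input_fn
# above would be the line-streaming alternative to train_input_fn here.
if __name__ == '__main__':
    (train_x, train_y), (test_x, test_y) = load_data()

    # One numeric feature column per remaining CSV field (the 'fake' label
    # was popped off by load_data).
    feature_columns = [tf.feature_column.numeric_column(key=key)
                       for key in train_x.keys()]

    classifier = tf.estimator.DNNClassifier(
        feature_columns=feature_columns,
        hidden_units=[10, 10],
        n_classes=len(TYPES))

    classifier.train(
        input_fn=lambda: train_input_fn(train_x, train_y, batch_size=100),
        steps=1000)

    eval_result = classifier.evaluate(
        input_fn=lambda: eval_input_fn(test_x, test_y, batch_size=100))
    print('Test accuracy: {accuracy:0.3f}'.format(**eval_result))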