로이터 뉴스 데이터셋


from tensorflow.keras.datasets import reuters

# Load the Reuters newswire train/test splits.
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)   # restrict the vocabulary to 10,000 words
len(train_data)  # number of training samples

8982

len(test_data)  # number of test samples

2246

print(train_data[10])   # each sample is a list of integers (word indices)

[1, 245, 273, 207, 156, 53, 74, 160, 26, 14, 46, 296, 26, 39, 74, 2979, 3554, 14, 46, 4689, 4329, 86, 61, 3499, 4795, 14, 61, 451, 4329, 17, 12]

train_labels   # the label attached to each sample is a topic index: an integer between 0 and 45

array([ 3, 4, 3, ..., 25, 3, 25])

# Decode the first training sample back into text.
# `word_index` maps word -> integer index; invert it to look words up by index.
word_index = reuters.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}
# Indices 0, 1 and 2 are reserved for "padding", "start of document" and
# "not in vocabulary", so subtract 3 from each stored index before the lookup;
# anything without a matching word is rendered as "?".
decord_newswire = " ".join(
    reverse_word_index.get(i - 3, "?") for i in train_data[0]
)
print(decord_newswire)

? ? ? said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3