텍스트 마이닝

eunki 2021. 5. 27. 15:41

728x90

1. 텍스트 데이터 전처리

1) 정규 표현식 적용

1-1) 영어

import re

def apply_regular_expression(text):
    """Normalize English text to lowercase letters separated by single spaces.

    The text is lowercased, every character other than ``a``-``z`` and the
    space is deleted (not replaced by a space), and runs of spaces are then
    collapsed into one. Leading and trailing spaces are not stripped.
    """
    lowered = text.lower()
    # A leading ^ inside a character class negates it: this matches anything
    # that is NOT a space or a lowercase ASCII letter.
    letters_only = re.sub('[^ a-z]', '', lowered)
    # Squeeze two or more consecutive spaces into a single space.
    return re.compile(' +').sub(' ', letters_only)

# Clean every value of the 'text' column and store the result in a new column.
df['preprocessed_text'] = df['text'].apply(apply_regular_expression)
df.head()

1-2) 한국어

import re

def apply_regular_expression(text):
    """Delete every character of ``text`` that is neither Hangul nor a space.

    Kept characters: the space, Hangul jamo in the range ㄱ-ㅣ and complete
    syllables in the range 가-힣. Spaces are not collapsed, so removing a
    token between two spaces leaves both spaces in place.
    """
    # The leading ^ negates the class, so the pattern matches what to remove.
    return re.sub('[^ ㄱ-ㅣ 가-힣]', '', text)

한국어 명사 형태소 추출

!pip install konlpy==0.5.1 jpype1 Jpype1-py3

from konlpy.tag import Okt
from collections import Counter

nouns_tagger = Okt()
# Merge every document of the 'text' column into one string before extracting
# nouns. Join with a space (not "") so the last word of one document is not
# fused with the first word of the next one.
nouns = nouns_tagger.nouns(apply_regular_expression(" ".join(df['text'].tolist())))

# Count how often each extracted noun occurs and show the 10 most frequent.
counter = Counter(nouns)
counter.most_common(10)

한국어 한글자 명사 제거

# Keep only nouns longer than one character; their counts are carried over.
available_counter = Counter(
    {noun: freq for noun, freq in counter.items() if len(noun) > 1}
)
available_counter.most_common(10)

2) 말뭉치(코퍼스) 생성

# Collect the 'preprocessed_text' column into a plain Python list.
corpus = df['preprocessed_text'].tolist()
corpus

3) BoW 벡터 생성

3-1) 영어

from sklearn.feature_extraction.text import CountVectorizer

# tokenizer: a callable that defines how incoming text is split into tokens
#            (None keeps the vectorizer's default behaviour).
# stop_words: words that carry little meaning of their own, e.g. "the", "into".
# analyzer: the unit of analysis, here individual words.
vect = CountVectorizer(tokenizer=None, stop_words="english", analyzer='word')

# fit_transform() learns the vocabulary and builds the document-term matrix in
# one call, so a separate fit() on the same corpus beforehand is redundant.
bow_vect = vect.fit_transform(corpus)
# Which word sits at each column of the matrix. get_feature_names() was
# removed in scikit-learn 1.2, so prefer get_feature_names_out() and fall back
# to the old name only on releases that do not have it yet.
if hasattr(vect, "get_feature_names_out"):
    word_list = vect.get_feature_names_out().tolist()
else:
    word_list = vect.get_feature_names()
count_list = bow_vect.toarray().sum(axis=0)  # total occurrences of each word

3-2) 한국어

from sklearn.feature_extraction.text import CountVectorizer

def text_cleaning(text):
    """Turn Korean ``text`` into a list of nouns usable as vectorizer tokens.

    Steps: delete every character that is not Hangul or a space, extract nouns
    with the module-level ``nouns_tagger``, then drop one-character nouns and
    nouns listed in ``stopwords``.

    NOTE(review): ``nouns_tagger`` and ``stopwords`` are read from module
    scope; ``stopwords`` is not defined in the visible part of this file and
    must exist before this function is called.

    Returns:
        list of noun strings, in the order the tagger produced them.
    """
    hangul = re.compile('[^ ㄱ-ㅣ 가-힣]')  # matches anything to be removed
    result = hangul.sub('', text)
    # Reuse the shared tagger. The original also built a fresh Okt() on every
    # call and never used it, which only added per-document overhead.
    nouns = nouns_tagger.nouns(result)
    # Drop single-character nouns and stop words in one pass.
    return [x for x in nouns if len(x) > 1 and x not in stopwords]

# Pass text_cleaning itself as the tokenizer; wrapping it in a lambda adds
# nothing to the call.
vect = CountVectorizer(tokenizer=text_cleaning)
bow_vect = vect.fit_transform(df['text'].tolist())
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(vect, "get_feature_names_out"):
    word_list = vect.get_feature_names_out().tolist()
else:
    word_list = vect.get_feature_names()
count_list = bow_vect.toarray().sum(axis=0)  # total occurrences of each word

import operator

# Pair each vocabulary word with its summed count.
word_count_dict = dict(zip(word_list, count_list))
sorted(word_count_dict.items(), key=operator.itemgetter(1), reverse=True)[:5]  # show the 5 most frequent words

2. 텍스트 마이닝

1) 단어별 빈도 분석

1-1) 워드 클라우드 시각화

!pip install pytagcloud pygame simplejson

from collections import Counter

import random
import pytagcloud
import webbrowser

# Build tags from the 40 (word, count) pairs with the highest counts.
taglist = pytagcloud.make_tags(sorted(word_count_dict.items(), key=operator.itemgetter(1), reverse=True)[:40], maxsize=60)
# Render the tags into an image file.
pytagcloud.create_tag_image(taglist, 'wordcloud_example.jpg', rectangular=False)

from IPython.display import Image
# Show the generated image file as the cell output.
Image(filename='wordcloud_example.jpg')

1-2) 상위 빈도수 단어 출력

# The 40 most frequent words as (word, count) pairs, highest count first.
ranked_tags = Counter(word_count_dict).most_common(40)
ranked_tags

2) 장면별 중요 단어 시각화

2-1) TF-IDF 변환

from sklearn.feature_extraction.text import TfidfTransformer

# Despite its name, this is a TfidfTransformer: it takes the existing count
# matrix (bow_vect) as input rather than raw text.
tfidf_vectorizer = TfidfTransformer()
tf_idf_vect = tfidf_vectorizer.fit_transform(bow_vect)
tf_idf_vect[0]  # TF-IDF row of the first document

2-2) 벡터 : 단어 맵핑

# vect.vocabulary_ maps word -> column index; flip it into column index -> word
# so that a vector position can be translated back into its word.
invert_index_vectorizer = {
    column: term for term, column in vect.vocabulary_.items()
}
invert_index_vectorizer

2-3) 중요 단어 추출 (Top 3 TF-IDF)

# argsort sorts ascending, so the last three columns of each row hold the
# indexes of the three highest TF-IDF terms, with the highest one last.
top_3_word = np.argsort(tf_idf_vect.toarray())[:, -3:]
# Build the Series on df's own index. A default 0..n-1 index would be aligned
# by label on assignment and leave NaN wherever df's index differs from it.
df['important_word_indexes'] = pd.Series(top_3_word.tolist(), index=df.index)
df.head()

def convert_to_word(x):
    """Translate each vocabulary index in ``x`` into its word, keeping order.

    Looks every index up in the module-level ``invert_index_vectorizer``
    mapping; an index missing from that mapping raises ``KeyError``.
    """
    return [invert_index_vectorizer[idx] for idx in x]

# Replace each row's list of vocabulary indexes with the matching words.
df['important_word'] = df['important_word_indexes'].apply(convert_to_word)
df.head()

728x90