파이썬_파일읽고쓰기_워드클라우드

DATA_STUDY 2024. 1. 22. 17:46

1.파일읽고쓰기

1.경로확인

# Show the home directory, the working directory, and the entries found
# directly inside the working directory.
from pathlib import Path

# Home directory of the current user
home_dir = Path.home()
print(home_dir)

# Directory the script / notebook is currently running in
working_dir = Path.cwd()
print(working_dir)

# Entries directly inside the working directory ('*' matches every name)
files = working_dir.glob('*')
for entry in files:
    print(entry)

2. 쓰기

# Create (or truncate) M.txt and write a single line to it.
# The with-statement closes the file even when write() raises, so the
# handle can no longer leak the way a manual open()/close() pair could.
# NOTE: no encoding is passed, so the platform default encoding is used.
with open('M.txt', 'w') as f:
    # '\n' ends the line, so anything written later starts on a new line
    f.write('안녕하세요?\n')

# Make sure the target directory exists; exist_ok=True makes this a no-op
# when the directory is already there.
Path('Files').mkdir(exist_ok=True)

# Create (or truncate) Files/M.txt and write a single line to it.
# The with-statement closes the file even when write() raises.
with open('Files/M.txt', 'w') as f:
    f.write('모두들 안녕하세요?\n')

3.읽기

# Print the whole content of M.txt.
# The with-statement closes the file even when read() raises.
with open('M.txt', 'r') as f:
    print(f.read())

4. 내용 추가

# Append several lines to the end of M.txt ('a' = append mode: existing
# content is kept). The with-statement closes the file even when write()
# raises.
with open('M.txt', 'a') as f:
    f.write('''
만나서 반갑습니다!
우라라라랄
우아러ㅣ나어
집이라어리ㅏ어
''')

# Print M.txt again to check the appended lines.
# The with-statement closes the file even when read() raises.
with open('M.txt', 'r') as f:
    print(f.read())

워드 클라우드 만들기

1) 텍스트 파일로 저장한 뒤 읽어오기

# Read the whole source text into memory.
# The encoding is given explicitly: per the original note, full-width
# characters such as curly quotes otherwise caused a read error.
# The with-statement closes the file even when read() raises.
with open('a.txt', 'r', encoding='UTF-8') as file:
    text = file.read()

# Preview the first 50 characters (shown as the cell result in a notebook)
text[:50]

2) 텍스트 전처리
- 파일을 읽어서 split()
- 단어별 빈도수를 계산하여 딕셔너리 형태로 저장
- 분석에 의미가 없는 조사들은 제거

from collections import Counter

# Split the text into words on runs of whitespace.
# split() with no argument treats newlines as separators too and never
# yields empty strings; split(' ') would cut on single spaces only and
# leave '\n' inside the words, which spoils the word cloud.
wordList = text.split()

# Preview the first 10 words
wordList[:10]

# Count every word in one pass over the list. Calling wordList.count(w)
# once per unique word rescans the whole list each time (quadratic work).
# Stored as a plain dict of word -> number of occurrences.
wordCount = dict(Counter(wordList))

# Unique words, kept under the original name for code that still uses it
worduniq = list(wordCount)

# To inspect the result: print(wordCount)

# Words to leave out of the cloud (common English function words)
del_word = ['the','a','is','are', 'not','of','on','that','this','and','be','to', 'from']

# Drop each excluded word from the frequency table. The words are the
# dict's keys; pop() with a default skips words that were never counted.
for excluded in del_word:
    wordCount.pop(excluded, None)

# To inspect the result: print(wordCount)

3) 워드 클라우드
- wordcloud 라이브러리를 설치
- 워드 클라우드를 그립니다.

# 패키지 설치
!pip install wordcloud

# 라이브러리 불러오기
import matplotlib.pyplot as plt
from wordcloud import WordCloud 
%config InlineBackend.figure_format='retina'#고해상도

# Configure the word cloud renderer.
# font_path points at a font file under the Windows fonts folder (the
# original note names a Korean typeface) — adjust it for your system.
cloud_builder = WordCloud(
    font_path = 'C:/Windows/fonts/HMKMRHD.TTF',
    width=2000,
    height=1000,
    background_color='white',
)

# Lay the words out according to the word -> frequency mapping
wordcloud = cloud_builder.generate_from_frequencies(wordCount)

# Show the generated cloud as an image, without axes or padding
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

자연어 처리하기

#라이브러리
import pandas as pd  
import numpy as np    
import matplotlib.pyplot as plt  
from wordcloud import WordCloud  
from collections import Counter  # 단어 카운트
import re    # 정규표현식
from PIL import Image # 이미지로드

# Font file name meant for rendering Korean text in the word cloud
font_path = 'malgun.ttf'

# Matplotlib text settings: use the Malgun Gothic family, and draw the
# minus sign as an ASCII hyphen (presumably because the Korean font lacks
# the Unicode minus glyph — TODO confirm).
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Load the two CSV files (the file names appear to be blank placeholders
# in this post) and show each frame's (rows, columns).
train_df = pd.read_csv('.csv')
test_df = pd.read_csv('.csv')
(train_df.shape, test_df.shape)

# Stack both frames row-wise, then renumber the rows from 0.
# NOTE(review): reset_index() without drop=True also keeps the old row
# labels in a new 'index' column — confirm that column is wanted.
frames = [train_df, test_df]
data = pd.concat(frames, axis=0).reset_index()

- 전처리 : 특수 문자 전처리 함수 생성

# Characters the tutorial lists as "special". The cleaning function no
# longer needs this string (see its docstring); the name is kept for any
# code that still refers to it. The backslash before “ is written as \\ so
# the value is unchanged but no invalid-escape warning is raised.
removal_list =  "‘, ’, ◇, ‘, ”,  ’, ', ·, \\“, ·, △, ●,  , ■, (, ), \", >>, `, /, #, ∼, =,ㆍ<,>, .,?, !,【,】, …, ◆,%, ₩"

# Compiled once at import time instead of on every call.
_PUNCTUATION_RE = re.compile("[.,'\"’‘”“!?]")          # deleted outright
_DISALLOWED_RE = re.compile(r"[^ㄱ-ㅎ가-힣a-zA-Z\s]")    # replaced by a space
_WHITESPACE_RE = re.compile(r"\s+")                    # collapsed to one space


def remove_special(sentence: str = None) -> str:
    """Return *sentence* reduced to Korean/Latin letters and single spaces.

    Steps, in order:
      1. Delete sentence punctuation and quotes without leaving a gap
         (so "a.b" becomes "ab").
      2. Replace every character that is not Hangul (ㄱ-ㅎ, 가-힣), an
         ASCII letter, or whitespace with a space. Digits fall in this group.
      3. Collapse each run of whitespace (newlines included) to one space
         and strip both ends.

    After step 3 only letters and single spaces remain, so translating the
    characters of ``removal_list`` or replacing '\\n' afterwards cannot
    change anything; those two steps were dropped as no-ops.

    Args:
        sentence: Text to clean. ``None`` (the default) is treated as
            empty text instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned text; ``""`` for ``None`` or empty input.
    """
    if sentence is None:
        return ""

    without_punct = _PUNCTUATION_RE.sub("", sentence)
    letters_only = _DISALLOWED_RE.sub(" ", without_punct)
    return _WHITESPACE_RE.sub(" ", letters_only).strip()

특수문자 제거 함수 적용하기

# Clean every text with one column-wise assignment.
# Writing cell by cell through data['a'].loc[i] = ... is chained
# assignment: it assigns into an intermediate Series, raises
# SettingWithCopyWarning, and leaves `data` unchanged when pandas
# copy-on-write is enabled.
data['a'] = data[''].apply(remove_special)
data

줄바꿈문자 제거하기

# Replace every newline character with a space, for the whole column at
# once. regex=False makes '\n' a literal match, like str.replace.
# The per-row data['a'].loc[i] = ... form is chained assignment: it raises
# SettingWithCopyWarning and leaves `data` unchanged when pandas
# copy-on-write is enabled.
data['a'] = data[''].str.replace('\n', ' ', regex=False)
data

단어 분리하기

# Split every text on single spaces into a list of words, for the whole
# column at once (consecutive spaces therefore yield empty strings).
# The per-row data['a'].loc[i] = ... form is chained assignment: it raises
# SettingWithCopyWarning and leaves `data` unchanged when pandas
# copy-on-write is enabled.
data['a'] = data[''].str.split(' ')

data

불용어, 조사 등 제거하기

# Korean particles, endings and filler words to drop. Built once at import
# time as a frozenset so each membership test is O(1); the original list
# was rebuilt on every call and contained '이' twice.
_STOPWORDS = frozenset([
    '의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '을', '으로',
    '자', '에', '와', '한', '로', '에서', '하는', '하면', '하고', '요',
    '혹시', '합니다', '감사합니다', '안녕하세요', '될까요', '해도', '다시',
])


def remove_stopword(sent):
    """Return the words of *sent* that are not stop words, in order.

    Args:
        sent: Iterable of word strings (words must be hashable).

    Returns:
        A new list containing the words that are not in the stop-word set.
    """
    return [word for word in sent if word not in _STOPWORDS]
    
    

# Drop stop words from every row with one column-wise assignment.
# The per-row data['a'].loc[i] = ... form is chained assignment: it raises
# SettingWithCopyWarning and leaves `data` unchanged when pandas
# copy-on-write is enabled.
data['a'] = data[''].apply(remove_stopword)

data

- 단어 리스트 생성하기

# Flatten all sentences into one list of space-separated tokens.
# Empty strings produced by consecutive spaces are kept here.
# NOTE(review): type_questions is not defined in the visible code —
# presumably an iterable of strings; confirm where it comes from.
word_list = [
    token
    for sentence in type_questions
    for token in sentence.split(' ')
]
word_list

# Flatten all sentences into one list of space-separated tokens, this time
# skipping the empty strings that consecutive spaces produce.
# NOTE(review): type_questions is not defined in the visible code —
# presumably an iterable of strings; confirm where it comes from.
word_list = [
    token
    for sentence in type_questions
    for token in sentence.split(' ')
    if token != ''
]
word_list

빈도수 카운트

# Tally how often each word occurs in word_list
from collections import Counter

word_count = Counter()
word_count.update(word_list)

# The three most frequent words, highest count first
word_count.most_common(3)

# Build the word cloud from the word frequencies

# Renderer options
cloud_options = dict(
    font_path='malgun.ttf',
    width=1200,
    height=1000,
    max_font_size=800,
    background_color='black',
)
origin_cloud = WordCloud(**cloud_options)

# NOTE: the original author hit "cannot open resource" here, attributed to
# the font path — check that the font file can be found if this fails.
w_image = origin_cloud.generate_from_frequencies(word_count)

# Show the cloud as an image with the axes hidden
plt.imshow(w_image)
plt.axis('off')

저작자표시

'DATA_STUDY' 카테고리의 다른 글

데이터 분석_CNN_CIFAR10_MNIST, 시계열_RNN기본_LSTM (0)	2024.01.22
파이썬_워드클라우드 2 (0)	2024.01.22
데이터 분석_파이토치 : Pytorch3 (0)	2024.01.22
데이터 분석_파이토치 : Pytorch2 (0)	2024.01.22
데이터 분석_파이토치 : Pytorch1 (0)	2024.01.19

ABOUT ME

Data_with_U Data_with_U

1.파일읽고쓰기

워드 클라우드 만들기

자연어 처리하기

'DATA_STUDY' 카테고리의 다른 글

티스토리툴바

ABOUT ME

1.파일읽고쓰기

워드 클라우드 만들기

자연어 처리하기

'DATA_STUDY' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바