# multi/keyword_module.py
import torch
import requests
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, AutoTokenizer, AutoModel
from konlpy.tag import Komoran
from keybert import KeyBERT
from bs4 import BeautifulSoup as bs

# --- KoBART for summarization ---
summary_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
summary_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-summarization")
def summarize_kobart(text, max_input_length=512):
    # Truncate the input to the model's maximum length
    input_ids = summary_tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=max_input_length)
    summary_ids = summary_model.generate(
        input_ids,
        max_length=160,
        min_length=100,
        num_beams=4,
        repetition_penalty=2.5,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    return summary_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# --- KoBERT embedding class ---
class KoBERTEmbedding:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def encode(self, documents):
        if isinstance(documents, str):
            documents = [documents]
        encoded_input = self.tokenizer(
            documents,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        if "token_type_ids" not in encoded_input:
            encoded_input["token_type_ids"] = torch.zeros_like(encoded_input["input_ids"])
        with torch.no_grad():
            output = self.model(**encoded_input)
        # Use the [CLS] token embedding as the document representation
        return output.last_hidden_state[:, 0, :].numpy()
# --- Keyword extraction ---
keyword_tokenizer = AutoTokenizer.from_pretrained("skt/kobert-base-v1", use_fast=False)
keyword_model = AutoModel.from_pretrained("skt/kobert-base-v1")
kobert_embedder = KoBERTEmbedding(keyword_model, keyword_tokenizer)
kw_model = KeyBERT(model=kobert_embedder)
# --- Stopword loading + morphological analyzer ---
komoran = Komoran()

def fetch_korean_stopwords():
    url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ko/master/stopwords-ko.txt"
    response = requests.get(url)
    response.raise_for_status()
    return response.text.splitlines()

stopwords = fetch_korean_stopwords()

def remove_stopwords(text, stopwords):
    # Keep only nouns that are not stopwords and are longer than one character
    nouns = komoran.nouns(text)
    return " ".join([w for w in nouns if w not in stopwords and len(w) > 1])
def extract_keywords(summary_text, top_n=5):
    # First pass: extract candidate keyphrases from the noun-filtered summary
    filtered = remove_stopwords(summary_text, stopwords)
    keywords_1st = kw_model.extract_keywords(
        filtered,
        keyphrase_ngram_range=(1, 4),
        stop_words=stopwords,
        top_n=15
    )
    # Second pass: re-rank the joined candidates down to the final top_n keywords
    joined = " ".join([kw for kw, _ in keywords_1st])
    keywords_2nd = kw_model.extract_keywords(joined, top_n=top_n)
    return keywords_1st, keywords_2nd
# --- News crawling ---
def fetch_html(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=5)
    response.raise_for_status()
    return bs(response.text, "html.parser")

def parse_naver(soup):
    title = soup.select_one("h2.media_end_head_headline") or soup.title
    time_tag = soup.select_one("span.media_end_head_info_datestamp_time")
    content_area = soup.find("div", {"id": "newsct_article"}) or soup.find("div", {"id": "dic_area"})
    # Fallback strings: "제목 없음" = no title, "시간 없음" = no timestamp, "본문 없음" = no body
    title_text = title.get_text(strip=True) if title else "제목 없음"
    time_text = time_tag.get_text(strip=True) if time_tag else "시간 없음"
    if content_area:
        paragraphs = content_area.find_all("p")
        content = '\n'.join([p.get_text(strip=True) for p in paragraphs]) if paragraphs else content_area.get_text(strip=True)
    else:
        content = "본문 없음"
    return title_text, time_text, content
def parse_daum(soup):
    title = soup.select_one("h3.tit_view") or soup.title
    time_tag = soup.select_one("span.num_date")
    content_area = soup.find("div", {"class": "article_view"})
    title_text = title.get_text(strip=True) if title else "제목 없음"
    time_text = time_tag.get_text(strip=True) if time_tag else "시간 없음"
    if content_area:
        paragraphs = content_area.find_all("p")
        content = '\n'.join([p.get_text(strip=True) for p in paragraphs]) if paragraphs else content_area.get_text(strip=True)
    else:
        content = "본문 없음"
    return title_text, time_text, content
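
# --- Example pipeline (sketch) ---
# The block below is not part of the original module: it is a minimal usage
# sketch showing how the pieces above are assumed to fit together, i.e.
# fetch a news page, parse it with the matching parser, summarize the body
# with KoBART, then extract keywords. The URL is a placeholder and the
# hostname-based dispatch is an assumption; the real routing presumably lives
# in the FastAPI app that imports this module.
if __name__ == "__main__":
    test_url = "https://n.news.naver.com/article/001/0000000000"  # placeholder URL

    soup = fetch_html(test_url)
    if "daum.net" in test_url:
        title, published_at, body = parse_daum(soup)
    else:
        title, published_at, body = parse_naver(soup)

    summary = summarize_kobart(body)
    keywords_1st, keywords_2nd = extract_keywords(summary, top_n=5)

    print("Title:", title)
    print("Published:", published_at)
    print("Summary:", summary)
    print("Keywords:", keywords_2nd)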