# multi/keyword_module.py
import torch
import requests
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, AutoTokenizer, AutoModel
from konlpy.tag import Komoran
from keybert import KeyBERT
from bs4 import BeautifulSoup as bs

# --- KoBART for summarization ---
summary_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
summary_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-summarization")


def summarize_kobart(text, max_input_length=512):
    # Truncate the input to the model's maximum length
    input_ids = summary_tokenizer.encode(
        text, return_tensors="pt", truncation=True, max_length=max_input_length
    )
    summary_ids = summary_model.generate(
        input_ids,
        max_length=160,
        min_length=100,
        num_beams=4,
        repetition_penalty=2.5,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    return summary_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# --- KoBERT embedding wrapper ---
class KoBERTEmbedding:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def encode(self, documents):
        if isinstance(documents, str):
            documents = [documents]
        encoded_input = self.tokenizer(
            documents, padding=True, truncation=True, max_length=512, return_tensors="pt"
        )
        # KoBERT expects token_type_ids; add zeros if the tokenizer did not return them
        if "token_type_ids" not in encoded_input:
            encoded_input["token_type_ids"] = torch.zeros_like(encoded_input["input_ids"])
        with torch.no_grad():
            output = self.model(**encoded_input)
        # Use the [CLS] token embedding as the document representation
        return output.last_hidden_state[:, 0, :].numpy()


# --- Keyword extraction ---
keyword_tokenizer = AutoTokenizer.from_pretrained("skt/kobert-base-v1", use_fast=False)
keyword_model = AutoModel.from_pretrained("skt/kobert-base-v1")
kobert_embedder = KoBERTEmbedding(keyword_model, keyword_tokenizer)
kw_model = KeyBERT(model=kobert_embedder)

# --- Stopword loading + morphological analyzer ---
komoran = Komoran()


def fetch_korean_stopwords():
    url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ko/master/stopwords-ko.txt"
    response = requests.get(url)
    return response.text.splitlines()


stopwords = fetch_korean_stopwords()


def remove_stopwords(text, stopwords):
    # Keep only nouns longer than one character that are not stopwords
    nouns = komoran.nouns(text)
    return " ".join([w for w in nouns if w not in stopwords and len(w) > 1])


def extract_keywords(summary_text, top_n=5):
    filtered = remove_stopwords(summary_text, stopwords)
    # First pass: extract candidate keyphrases (1- to 4-grams)
    keywords_1st = kw_model.extract_keywords(
        filtered, keyphrase_ngram_range=(1, 4), stop_words=stopwords, top_n=15
    )
    # Second pass: re-rank the joined candidates to get the final top_n keywords
    joined = " ".join([kw for kw, _ in keywords_1st])
    keywords_2nd = kw_model.extract_keywords(joined, top_n=top_n)
    return keywords_1st, keywords_2nd


# --- News crawling ---
def fetch_html(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=5)
    response.raise_for_status()
    return bs(response.text, "html.parser")


def parse_naver(soup):
    title = soup.select_one("h2.media_end_head_headline") or soup.title
    time_tag = soup.select_one("span.media_end_head_info_datestamp_time")
    content_area = soup.find("div", {"id": "newsct_article"}) or soup.find("div", {"id": "dic_area"})

    title_text = title.get_text(strip=True) if title else "제목 없음"
    time_text = time_tag.get_text(strip=True) if time_tag else "시간 없음"
    if content_area:
        paragraphs = content_area.find_all("p")
        content = "\n".join([p.get_text(strip=True) for p in paragraphs]) if paragraphs else content_area.get_text(strip=True)
    else:
        content = "본문 없음"
    return title_text, time_text, content


def parse_daum(soup):
    title = soup.select_one("h3.tit_view") or soup.title
    time_tag = soup.select_one("span.num_date")
    content_area = soup.find("div", {"class": "article_view"})

    title_text = title.get_text(strip=True) if title else "제목 없음"
    time_text = time_tag.get_text(strip=True) if time_tag else "시간 없음"
    if content_area:
        paragraphs = content_area.find_all("p")
        content = "\n".join([p.get_text(strip=True) for p in paragraphs]) if paragraphs else content_area.get_text(strip=True)
    else:
        content = "본문 없음"
    return title_text, time_text, content
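

# --- Example usage (illustrative sketch, not part of the module's public API) ---
# A minimal end-to-end run assuming a reachable Naver article page; the URL below is a
# placeholder and must be replaced, and parse_naver/parse_daum only match Naver/Daum markup.
if __name__ == "__main__":
    article_url = "https://n.news.naver.com/article/..."  # placeholder, replace with a real article URL

    soup = fetch_html(article_url)
    title, published_at, body = parse_naver(soup)

    summary = summarize_kobart(body)
    keywords_1st, keywords_2nd = extract_keywords(summary, top_n=5)

    print("Title:", title)
    print("Published:", published_at)
    print("Summary:", summary)
    print("Keywords:", keywords_2nd)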