# partially from https://gist.github.com/gaulinmp/da5825de975ed0ea6a24186434c24fe4
import re
from collections import Counter
from itertools import chain

import spacy
from nltk.util import ngrams
from datasets import load_dataset

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("sentencizer")
STOPWORDS = nlp.Defaults.stop_words

N = 5  # n-gram order

# Naive sentence splitting: treat periods and newlines as sentence ends, and
# replace every character that is not a letter, period, or newline with a
# space before splitting.
re_sent_ends_naive = re.compile(r'[.\n]')
re_stripper_naive = re.compile(r'[^a-zA-Z.\n]')
splitter_naive = lambda x: re_sent_ends_naive.split(re_stripper_naive.sub(' ', x))
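# Illustrative check (the example input is mine, not from the original file):
# digits and punctuation are blanked out before the split, so
#   splitter_naive("one two. three 4 five")  ->  ['one two', ' three   five']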
# Drop stopwords from a list of word strings (one sentence).
def remove_stop_words(text):
    return [w for w in text if w not in STOPWORDS]

# spaCy-based alternative: split a text into sentences, then drop stopwords
# from each sentence's token texts (Token objects are converted to strings so
# the STOPWORDS membership test actually matches).
def parse_sentences(text, nlp):
    doc = nlp(text)
    sentences = (remove_stop_words([tok.text for tok in sent]) for sent in doc.sents)
    return sentences
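# Illustrative (exact tokens depend on the spaCy model):
#   parse_sentences("Dogs bark. Cats meow.", nlp)
# yields roughly [['Dogs', 'bark', '.'], ['Cats', 'meow', '.']] -- note that
# punctuation tokens survive, since "." is not in the stopword list.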
def get_tuples_manual_sentences(txt, N):
    """Naive n-gram extraction that uses periods or newlines to denote sentences."""
    if not txt:
        return None, []
    sentences = (x.split() for x in splitter_naive(txt) if x)
    sentences = [remove_stop_words(s) for s in sentences]
    # Slower alternative using spaCy's sentencizer:
    # sentences = list(parse_sentences(txt, nlp))
    ng = (ngrams(x, N) for x in sentences if len(x) >= N)
    return sentences, list(chain(*ng))
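# Example (illustrative; results depend on the spaCy stopword list): with N=2,
#   get_tuples_manual_sentences("heart attack risk. aspirin lowers risk", 2)
# returns sentences [['heart', 'attack', 'risk'], ['aspirin', 'lowers', 'risk']]
# and bigrams [('heart', 'attack'), ('attack', 'risk'),
#              ('aspirin', 'lowers'), ('lowers', 'risk')].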
# Count n-gram occurrences across every document in one dataset split.
def count_by_split(split_data):
    c = Counter()
    for entry in split_data:
        text = entry['text']
        sents, tup = get_tuples_manual_sentences(text, N)
        tup = ["_".join(ta) for ta in tup]  # join each n-gram tuple into one key
        c.update(tup)
    return c
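# Toy usage (hypothetical records matching the {'text': ...} schema the
# function expects): each six-word text contributes two 5-grams, and
# "alpha_beta_gamma_delta_epsilon" is counted twice.
# toy_split = [{"text": "alpha beta gamma delta epsilon zeta"},
#              {"text": "alpha beta gamma delta epsilon eta"}]
# count_by_split(toy_split)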
# Driver (left commented out in the source): count n-grams per dataset split,
# then intersect the counters to measure overlap between splits.
# data = load_dataset("bigbio/biodatasets/chemdner/chemdner.py", name="chemdner_bigbio_text")
# counters = []
# for split, split_data in data.items():
#     split_counter = count_by_split(split_data)
#     counters.append(split_counter)
# ab_intersect = counters[0] & counters[1]
# diff = {x: count for x, count in counters[0].items() if x not in ab_intersect and count > 2}
# if len(counters) > 2:
#     bc_intersect = counters[1] & counters[2]
# print(ab_intersect.most_common(10))
# print(Counter(diff).most_common(10))
# data.cleanup_cache_files()
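# Minimal self-contained demo (my assumption about the intended usage, since
# the real dataset driver above is commented out): two fake splits in the same
# {'text': ...} schema, checked for shared 5-grams.
if __name__ == "__main__":
    fake_data = {
        "train": [{"text": "alpha beta gamma delta epsilon zeta"}],
        "test": [{"text": "alpha beta gamma delta epsilon eta"}],
    }
    counters = [count_by_split(split) for split in fake_data.values()]
    shared = counters[0] & counters[1]  # n-grams present in both splits
    print("shared:", shared.most_common(10))
    print("train only:", [k for k in counters[0] if k not in shared])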