def summarize(text, n):
    """Return the ``n`` best-scoring sentences of ``text``.

    The text is split into sentences with ``sent_tokenize``; every sentence
    is lower-cased and split into words with ``word_tokenize``.  A sentence's
    score is the sum of the ``calculate_freq`` values of its words, and
    ``rank`` selects the indices of the ``n`` highest scores.

    Args:
        text: The text to summarize.
        n: Number of sentences requested.

    Returns:
        A list of sentences exactly as produced by ``sent_tokenize``, in the
        order returned by ``rank``.  The list can hold fewer than ``n``
        items, because a sentence without any scored word never gets a
        ranking entry.

    Raises:
        AssertionError: If ``n`` is greater than the number of sentences
            found in ``text``.
    """
    sents = sent_tokenize(text)

    # Raised explicitly instead of via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not n <= len(sents):
        raise AssertionError(
            "cannot pick %s sentence(s) from a text with only %d"
            % (n, len(sents))
        )

    # One list of lower-cased word tokens per sentence.
    list_sentences = [word_tokenize(s.lower()) for s in sents]
    frequency = calculate_freq(list_sentences)

    ranking = defaultdict(int)
    for i, sent in enumerate(list_sentences):
        for w in sent:
            # Test membership before indexing: only sentences that contain
            # at least one scored word receive an entry in ``ranking``.
            if w in frequency:
                ranking[i] += frequency[w]

    sents_idx = rank(ranking, n)
    return [sents[j] for j in sents_idx]
# Load the texts from the CSV file and build a one-sentence summary for each.
data = pd.read_csv('dataframe.csv')
text = data.iloc[:, 2]  # the third column is used as the text to summarize
list_of_strings = list(text)

our_summary = []
for s in list_of_strings:
    # summarize(s, 1) returns a list with at most one sentence, and raises
    # AssertionError when the text contains no sentence at all.
    # Collect every returned sentence; without this step ``our_summary``
    # stays empty and the DataFrame below has no rows.
    our_summary.extend(summarize(s, 1))

ours = pd.DataFrame({"our_summary": our_summary})
EDIT: 他の2つの関数は、次のとおりです。
def calculate_freq(list_sentences, stopwords=None, min_cut=None, max_cut=None):
    """Compute normalized word frequencies over tokenized sentences.

    Words are counted across all sentences (stop words excluded), each count
    is divided by the highest count, and only words whose normalized
    frequency lies strictly between the two cut-offs are kept.

    Args:
        list_sentences: Iterable of sentences, each an iterable of word
            tokens.
        stopwords: Container of words to ignore.  Defaults to the
            module-level ``our_stopwords``.
        min_cut: Lower cut-off; words with a normalized frequency less than
            or equal to it are dropped.  Defaults to the module-level
            ``min_freq``.
        max_cut: Upper cut-off; words with a normalized frequency greater
            than or equal to it are dropped.  Defaults to the module-level
            ``max_freq``.

    Returns:
        A ``defaultdict(int)`` mapping each kept word to its normalized
        frequency.  It is empty when no word was counted.
    """
    # The module-level names are only looked up when the caller did not
    # supply a value, so existing one-argument calls behave as before.
    if stopwords is None:
        stopwords = our_stopwords
    if min_cut is None:
        min_cut = min_freq
    if max_cut is None:
        max_cut = max_freq

    counts = defaultdict(int)
    for sentence in list_sentences:
        for word in sentence:
            if word not in stopwords:
                counts[word] += 1

    # Nothing counted: there is no maximum to normalize by.
    if not counts:
        return counts

    highest = float(max(counts.values()))

    # Build a new mapping instead of deleting from ``counts`` while
    # iterating it, which raises RuntimeError on Python 3.
    frequency = defaultdict(int)
    for word, count in counts.items():
        score = count / highest
        if min_cut < score < max_cut:
            frequency[word] = score
    return frequency
def rank(ranking, n):
    """Return the ``n`` keys of ``ranking`` that have the highest values.

    Args:
        ranking: Mapping from a key to its score.
        n: Maximum number of keys to return.

    Returns:
        A list of keys ordered from highest to lowest score.  It holds
        fewer than ``n`` keys when ``ranking`` has fewer entries.
    """
    # Iterating a mapping yields its keys; ``ranking.get`` supplies the score
    # used for the comparison.
    return nlargest(n, ranking, key=ranking.get)
入力テキスト:レシピは簡単で、犬はそれらを愛します。私はこの本を何度も買うだろう。唯一のことは、レシピではどれくらいのおやつがあるのかを教えてくれないということですが、それはあなたがそれらをすべて異なるサイズにすることができるからです。素晴らしい買い物! 出力テキスト:私はこの本を何度も買うだろう。
このコードの代わりに、テキストと予想される出力でデータを入力できますか? –
pandas.DataFrame.applyを見てみるとよいでしょう。 –
'summarize()'が別の関数を呼び出しています。このための入力と出力の例を含めることができますか? – roganjosh