import pandas
import nltk
nltk.download('wordnet')

# Load the dataset of COVID abstracts
dataset = pandas.read_csv('covid_abstracts.csv')
dataset.head()
# Fetch the word count for each title
dataset['word_count'] = dataset['title'].apply(lambda x: len(str(x).split(" ")))
dataset[['title', 'word_count']].head()

# Descriptive statistics of the word counts
dataset.word_count.describe()
# Identify the most common words
freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[:20]
freq

# Identify the least common words
freq1 = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[-20:]
freq1
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()
stem = PorterStemmer()
word = "cryptogenic"
print("stemming:", stem.stem(word))
print("lemmatization:", lem.lemmatize(word, "v"))
# Libraries for text preprocessing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Create the stop word list and extend it with custom, domain-specific stop words
stop_words = set(stopwords.words("english"))
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = stop_words.union(new_words)
print(stop_words)
print(new_words)
# Build the cleaned corpus from the titles
lem = WordNetLemmatizer()
corpus = []
for i in range(len(dataset)):
    # Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', str(dataset['title'][i]))
    # Convert to lowercase
    text = text.lower()
    # Remove tags
    text = re.sub("</?.*?>", " <> ", text)
    # Remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    # Tokenize, drop stop words, and lemmatize
    text = text.split()
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)
# View sample corpus items
corpus[222]
corpus[300]
# Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
    random_state=42
).generate(" ".join(corpus))

fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)
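# Optional sketch: the cloud can also be built from explicit counts via
# generate_from_frequencies, which avoids passing raw text at all.
word_freq = pandas.Series(' '.join(corpus).split()).value_counts().to_dict()
wordcloud_freq = WordCloud(background_color='white', max_words=100, random_state=42).generate_from_frequencies(word_freq)
plt.imshow(wordcloud_freq)
plt.axis('off')
plt.show()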
from sklearn.feature_extraction.text import CountVectorizer

# Option 1: CountVectorizer with scikit-learn's built-in English stop word list
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

# Option 2: pass a custom stop word list instead
custom_stop_words = ['same', 'hers', 'they', 'with', 'if', 'y', 'iv', 'new', ...]  # add your custom stop words
cv = CountVectorizer(max_df=0.8, stop_words=custom_stop_words, max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

# Equivalently, the stop_words set built earlier can be reused:
# cv = CountVectorizer(max_df=0.8, stop_words=list(stop_words), max_features=10000, ngram_range=(1, 3))
# X = cv.fit_transform(corpus)
# Inspect ten of the learned vocabulary terms
list(cv.vocabulary_.keys())[:10]
# Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Convert the most frequent words to a dataframe for plotting
top_words = get_top_n_words(corpus, n=20)
top_df = pandas.DataFrame(top_words)
top_df.columns = ["Word", "Freq"]

# Bar plot of the most frequent words
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
# Most frequently occurring bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2, 2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top2_words = get_top_n2_words(corpus, n=20)
top2_df = pandas.DataFrame(top2_words)
top2_df.columns = ["Bi-gram", "Freq"]
print(top2_df)

# Bar plot of the most frequent bi-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
h = sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45)
# Most frequently occurring tri-grams
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3, 3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top3_words = get_top_n3_words(corpus, n=20)
top3_df = pandas.DataFrame(top3_words)
top3_df.columns = ["Tri-gram", "Freq"]
print(top3_df)

# Bar plot of the most frequent tri-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)
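# The three helpers above differ only in ngram_range; a single parametrized version
# (a sketch, the name get_top_ngrams is not from the original) avoids the repetition.
def get_top_ngrams(corpus, ngram_range=(1, 1), n=20):
    vec = CountVectorizer(ngram_range=ngram_range, max_features=2000).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]

# e.g. get_top_ngrams(corpus, ngram_range=(2, 2), n=20) reproduces the bi-gram table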
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Create a CountVectorizer and fit it to the corpus
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

# Create a TfidfTransformer and fit it to the CountVectorizer output
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# Get feature names from the CountVectorizer
feature_names = cv.get_feature_names_out()

# Fetch the document for which keywords need to be extracted
doc = corpus[82]

# Generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
# Function for sorting tf-idf scores in descending order
from scipy.sparse import coo_matrix

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items."""
    # Use only the top n items from the vector
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []
    # Collect each word index and its corresponding tf-idf score
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    # Build a dict mapping feature -> score
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    return results
# Sort the tf-idf vector by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Extract only the top n; n here is 10
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

# Print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k, keywords[k])
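# A small helper (sketch; extract_keywords is a hypothetical name, not from the
# original) folds the steps above together so keywords can be pulled for any index.
def extract_keywords(doc_index, topn=10):
    vector = tfidf_transformer.transform(cv.transform([corpus[doc_index]]))
    return extract_topn_from_vector(feature_names, sort_coo(vector.tocoo()), topn)

print(extract_keywords(300))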
# Train a small word2vec model on the cleaned titles
from gensim.models import word2vec

tokenized_sentences = [sentence.split() for sentence in corpus]
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
model.wv.most_similar(positive=["incidence"])
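# Sketch: the learned embeddings can be inspected directly (the second term is an
# assumption and is only compared if it actually appears in the corpus vocabulary).
print(model.wv["incidence"].shape)  # gensim's default vector_size is 100
if "prevalence" in model.wv:
    print(model.wv.similarity("incidence", "prevalence"))  # cosine similarity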
import nltk
#nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

# Look up the WordNet synsets for a word and read the first definition
wn.synsets('car')
wn.synset('car.n.01').definition()
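# A synset also exposes its lemma names and usage examples (added illustration).
wn.synset('car.n.01').lemma_names()
wn.synset('car.n.01').examples()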
import gradio as gr
from nltk.corpus import wordnet as wn

# Function to get the definition of the first synset for a given word
def get_synset_definition(word):
    synsets = wn.synsets(word)
    if synsets:
        first_synset = synsets[0]
        return first_synset.definition()
    else:
        return "No synsets found for the given word."

# Gradio interface
iface = gr.Interface(
    fn=get_synset_definition,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    live=True,
    title="Key Extraction",
    description="Enter a word to get the definition of its first WordNet synset.",
)

# Launch the Gradio interface
iface.launch()
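# Note: when running in a hosted or remote environment, iface.launch(share=True)
# generates a temporary public link to the demo.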