# https://www.kaggle.com/code/nechbamohammed/abstract-clustering-for-scientific-paper-insights
# https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing
# Install the third-party packages imported further down (bertopic, minisom).
!pip install bertopic
!pip install minisom
#from plotly.offline import init_notebook_mode
#init_notebook_mode(connected=True)
import pandas as pd #Data processing, CSV files I/O (e.g. pd.read_csv)
import numpy as np #Linear Algebra: Matrices ...
import matplotlib.pyplot as plt #Data Visualisation
import seaborn as sns
from bertopic import BERTopic
from tqdm import tqdm
# I discovered that it is possible to download models built specifically for preprocessing scientific texts.
# In the spaCy docs I found a model for this purpose: https://spacy.io/universe/project/scispacy
# Download the en_core_sci_lg model to preprocess the abstracts.
from IPython.utils import io
with io.capture_output() as captured:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz
# Import NLP libraries and the spaCy package to preprocess the abstract text
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #import commen list of stopword
import en_core_sci_lg # import downlaoded model
import string
from minisom import MiniSom
from sklearn.cluster import SpectralClustering
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
# Load the abstracts CSV (path is relative to the working directory).
df = pd.read_csv("scopus_vitaletti_abstracts.csv")
# Preview the first five rows.
df.head(5)
Authors | Author full names | Author(s) ID | Title | Year | Volume | Issue | Art. No. | Page start | Page end | Page count | Cited by | DOI | Link | Abstract | Author Keywords | Index Keywords | Document Type | Source | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Becchetti L.; Colesanti U.M.; Marchetti-Spacca... | Becchetti, Luca (56218807100); Colesanti, Ugo ... | 56218807100; 24073192000; 7004071298; 6506463530 | Recommending items in pervasive scenarios: Mod... | 2011 | 28 | 3 | NaN | 555 | 578.0 | 23.0 | 9 | 10.1007/s10115-010-0338-4 | https://www.scopus.com/inward/record.uri?eid=2... | In this paper, we propose and investigate the ... | Collaborative filtering; Decentralized recomme... | Aggregates; Behavioral research; Collaborative... | Article | Scopus |
1 | Chatzigiannakis I.; Mylonas G.; Vitaletti A. | Chatzigiannakis, Ioannis (6602645904); Mylonas... | 6602645904; 36846385500; 6506463530 | Urban pervasive applications: Challenges, scen... | 2011 | 5 | 1 | NaN | 103 | 118.0 | 15.0 | 26 | 10.1016/j.cosrev.2010.09.003 | https://www.scopus.com/inward/record.uri?eid=2... | In this work, we discuss various aspects of th... | Challenges; Games; NFC; Participatory; Pervasi... | NaN | Article | Scopus |
2 | Becchetti L.; Leonardi S.; Marchetti-Spaccamel... | Becchetti, L. (56218807100); Leonardi, S. (563... | 56218807100; 56366507000; 7004071298; 65064635... | Parallel scheduling problems in next generatio... | 2005 | 45 | 1 | NaN | 9 | 22.0 | 13.0 | 1 | 10.1002/net.20045 | https://www.scopus.com/inward/record.uri?eid=2... | Next-generation 3G/4G wireless data networks a... | CDMA; Convex programming; On-line algorithms; ... | Algorithms; Code division multiple access; Opt... | Article | Scopus |
3 | Pennino D.; Pizzonia M.; Vitaletti A.; Zecchin... | Pennino, Diego (57207940361); Pizzonia, Mauriz... | 57207940361; 6603249368; 6506463530; 57215119316 | Blockchain as IoT Economy Enabler: A Review of... | 2022 | 11 | 2 | 20.0 | NaN | NaN | NaN | 19 | 10.3390/jsan11020020 | https://www.scopus.com/inward/record.uri?eid=2... | In the IoT-based economy, a large number of su... | applications of IoT and blockchain; blockchain... | NaN | Review | Scopus |
4 | Santini S.; Roemer K.; Couderc P.; Marrón P.J.... | Santini, Silvia (35303602900); Roemer, Kay (35... | 35303602900; 35867822000; 22333533900; 6603114... | System Architectures and Programming Models | 2010 | NaN | NaN | NaN | 347 | 404.0 | 57.0 | 0 | 10.1002/9780470610817.ch5 | https://www.scopus.com/inward/record.uri?eid=2... | [No abstract available] | Database view; Node internals; Programming mod... | NaN | Book chapter | Scopus |
# Notebook inspection cells: the full frame, its column summary, and the
# number of missing values per column.
df
df.info()
df.isna().sum()
# Extra words that spacy_tokenizer drops in addition to spaCy's STOP_WORDS.
my_stopwords = ["abstract","sunrise","available","address","propose","problem","request","code","deluge"]
# Pipeline object that spacy_tokenizer calls to split text into tokens with lemmas.
parser = en_core_sci_lg.load()
# parser.max_length = 7000000 #Limit the size of the parser
def spacy_tokenizer(sentence):
    """Preprocess the text of a scientific paper.

    The text is run through the module-level ``parser``; every token is
    replaced by its lower-cased, stripped lemma, and lemmas found in
    ``stopwords``, ``punctuations`` or ``my_stopwords`` are discarded.

    Args:
        sentence: Raw text, e.g. one abstract. Non-string values (such as
            the float NaN pandas uses for a missing cell) are treated as
            empty text.

    Returns:
        The surviving lemmas joined by single spaces, or ``""`` when
        ``sentence`` is not a string.
    """
    # A missing cell would reach this function as a non-str value when the
    # function is applied over a DataFrame column; return early instead of
    # handing it to the parser.
    if not isinstance(sentence, str):
        return ""

    lemmas = [
        # "-PRON-" is the placeholder lemma spaCy 2.x assigned to pronouns;
        # fall back to the lower-cased surface form in that case.
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in parser(sentence)
    ]

    # NOTE: `punctuations` is a str, so `not in punctuations` is a substring
    # test. Besides single punctuation marks it also discards empty lemmas
    # and any multi-character run that occurs contiguously in that string.
    kept = [
        lemma
        for lemma in lemmas
        if lemma not in stopwords
        and lemma not in punctuations
        and lemma not in my_stopwords
    ]
    return " ".join(kept)
# Both names are read as globals by spacy_tokenizer.
punctuations = string.punctuation # str of punctuation characters to remove from text
# spaCy's English stop words as a list (a list so it can be sliced below).
stopwords = list(STOP_WORDS)
# Peek at the first ten entries.
stopwords[:10]
# The dataframe can still contain a huge amount of data. To process it faster, the df can be reduced to 10000 rows.
# The scope of the notebook is not to analyze all the data.
# df = df.sample(10000, random_state=42)
# Enable tqdm's progress_apply on pandas objects, then store the
# spacy_tokenizer result for every abstract in a new "processed_text" column.
tqdm.pandas()
df["processed_text"] = df["Abstract"].progress_apply(spacy_tokenizer)
100%|██████████| 90/90 [00:03<00:00, 23.86it/s]
# Display the raw abstracts and their preprocessed counterparts for comparison.
df['Abstract']
df["processed_text"]
# Build the topic model on top of the "paraphrase-MiniLM-L6-v2" embedding
# model, with progress logging enabled and min_topic_size set to 2.
topic_model = BERTopic(
    embedding_model="paraphrase-MiniLM-L6-v2",
    min_topic_size=2,
    verbose=True,
)
# Fit on the preprocessed abstracts and keep the per-document results.
topics, probs = topic_model.fit_transform(df["processed_text"].to_numpy())
# Number of rows in the topic summary table.
len(topic_model.get_topic_info())
2024-06-26 12:54:08,477 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 0%| | 0/3 [00:00<?, ?it/s]
2024-06-26 12:54:15,162 - BERTopic - Embedding - Completed ✓ 2024-06-26 12:54:15,165 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm 2024-06-26 12:54:17,718 - BERTopic - Dimensionality - Completed ✓ 2024-06-26 12:54:17,720 - BERTopic - Cluster - Start clustering the reduced embeddings 2024-06-26 12:54:17,735 - BERTopic - Cluster - Completed ✓ 2024-06-26 12:54:17,742 - BERTopic - Representation - Extracting topics from clusters using representation models. 2024-06-26 12:54:17,778 - BERTopic - Representation - Completed ✓
14
# Display the topic summary table produced by the fitted model.
topic_model.get_topic_info()
Topic | Count | Name | Representation | Representative_Docs | |
---|---|---|---|---|---|
0 | -1 | 5 | -1_gms_maintenance_company_activity | [gms, maintenance, company, activity, smart, c... | [smart city leverage information communication... |
1 | 0 | 17 | 0_sensor_wireless_network_datum | [sensor, wireless, network, datum, node, energ... | [sensor network consist sensing device exchang... |
2 | 1 | 10 | 1_user_information_exchange_use | [user, information, exchange, use, phone, tag,... | [paper fully decentralized approach recommend ... |
3 | 2 | 8 | 2_product_process_design_technology | [product, process, design, technology, art, us... | [gende http://www.gende.it tool allow designer... |
4 | 3 | 8 | 3_protocol_dtn_delay_persistent | [protocol, dtn, delay, persistent, delivery, t... | [paper new taxonomy delay tol- erant network d... |
5 | 4 | 7 | 4_blockchain_decentralized_nft_identity | [blockchain, decentralized, nft, identity, sub... | [application identity management idm use assoc... |
6 | 5 | 7 | 5_datum_iid_clinical_trial | [datum, iid, clinical, trial, learning, privac... | [artificial intelligence-based ai analysis lar... |
7 | 6 | 6 | 6_robinson_list_decentralised_poc | [robinson, list, decentralised, poc, centralis... | [, robinson list protect user privacy frequent... |
8 | 7 | 4 | 7_result_simulator_testbed_infrastructure | [result, simulator, testbed, infrastructure, s... | [present discuss challenge solution pose desig... |
9 | 8 | 4 | 8_signal_plant_electrical_stimulus | [signal, plant, electrical, stimulus, feature,... | [plant electrical signal contain low frequency... |
10 | 9 | 4 | 9_social_population_real_protocols | [social, population, real, protocols, interact... | [paper present experimental analysis assess pe... |
11 | 10 | 4 | 10_algorithm_topology_network_caching | [algorithm, topology, network, caching, node, ... | [study power randomization design online graph... |
12 | 11 | 3 | 11_fragmented_tracking_market_highly | [fragmented, tracking, market, highly, beerlov... | [need blockchain orient software engineering b... |
13 | 12 | 3 | 12_robot_interactive_surveillance_situation | [robot, interactive, surveillance, situation, ... | [paper present smart city architecture develop... |
import plotly.io as pio

# Default renderer used when the Plotly figures below are displayed.
# Use "colab" instead when running this notebook on Google Colab.
pio.renderers.default = "notebook"
# BERTopic's built-in Plotly visualisations; see
# https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html
# Term bar charts, limited to 3 topics.
topic_model.visualize_barchart(top_n_topics=3, height=700)
# Term-rank plot across topics.
topic_model.visualize_term_rank()
# Topic map, requested for up to 21 topics, then again with default
# arguments and an explicit .show().
topic_model.visualize_topics(top_n_topics=21)
topic_model.visualize_topics().show()
# Topic hierarchy, requested for up to 21 topics.
topic_model.visualize_hierarchy(top_n_topics=21, width=800)
# Topic heatmap with n_clusters=5.
topic_model.visualize_heatmap(n_clusters=5, top_n_topics=21)
# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
# NOTE: You can hide the hover with `hide_document_hover=True` which is especially helpful if you have a large dataset
topic_model.visualize_documents(df["Title"])
# Shell commands (not Python): write a copy of the notebook with
# .metadata.widgets deleted, then convert that copy to HTML.
jq -M 'del(.metadata.widgets)' Abstract_Analysis.ipynb > Abstract_Analysis_new.ipynb
jupyter nbconvert --to html Abstract_Analysis_new.ipynb