In [ ]:
# https://www.kaggle.com/code/nechbamohammed/abstract-clustering-for-scientific-paper-insights
# https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing

!pip install bertopic
!pip install minisom
In [ ]:
#from plotly.offline import init_notebook_mode
#init_notebook_mode(connected=True)
In [3]:
import pandas as pd                                        #Data processing, CSV files I/O (e.g. pd.read_csv)
import numpy as np                                         #Linear Algebra: Matrices ...
import matplotlib.pyplot as plt                            #Data Visualisation
import seaborn as sns
from bertopic import BERTopic

from tqdm import tqdm
# I discovered that it's possible to download models built specifically for preprocessing scientific texts
# The spaCy docs list a dedicated project for this: https://spacy.io/universe/project/scispacy
# Download the en_core_sci_lg model to preprocess the abstracts
from IPython.utils import io
with io.capture_output() as captured:
    !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz
In [7]:
# Import NLP libraries and the spaCy package to preprocess the abstract text
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #import commen list of stopword
import en_core_sci_lg  # import downlaoded model
import string
from minisom import MiniSom
from sklearn.cluster import SpectralClustering
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
In [4]:
# Load the abstracts CSV into a DataFrame
df = pd.read_csv("scopus_vitaletti_abstracts.csv")
# Preview the first five rows (the default for head())
df.head()
Out[4]:
Authors Author full names Author(s) ID Title Year Volume Issue Art. No. Page start Page end Page count Cited by DOI Link Abstract Author Keywords Index Keywords Document Type Source
0 Becchetti L.; Colesanti U.M.; Marchetti-Spacca... Becchetti, Luca (56218807100); Colesanti, Ugo ... 56218807100; 24073192000; 7004071298; 6506463530 Recommending items in pervasive scenarios: Mod... 2011 28 3 NaN 555 578.0 23.0 9 10.1007/s10115-010-0338-4 https://www.scopus.com/inward/record.uri?eid=2... In this paper, we propose and investigate the ... Collaborative filtering; Decentralized recomme... Aggregates; Behavioral research; Collaborative... Article Scopus
1 Chatzigiannakis I.; Mylonas G.; Vitaletti A. Chatzigiannakis, Ioannis (6602645904); Mylonas... 6602645904; 36846385500; 6506463530 Urban pervasive applications: Challenges, scen... 2011 5 1 NaN 103 118.0 15.0 26 10.1016/j.cosrev.2010.09.003 https://www.scopus.com/inward/record.uri?eid=2... In this work, we discuss various aspects of th... Challenges; Games; NFC; Participatory; Pervasi... NaN Article Scopus
2 Becchetti L.; Leonardi S.; Marchetti-Spaccamel... Becchetti, L. (56218807100); Leonardi, S. (563... 56218807100; 56366507000; 7004071298; 65064635... Parallel scheduling problems in next generatio... 2005 45 1 NaN 9 22.0 13.0 1 10.1002/net.20045 https://www.scopus.com/inward/record.uri?eid=2... Next-generation 3G/4G wireless data networks a... CDMA; Convex programming; On-line algorithms; ... Algorithms; Code division multiple access; Opt... Article Scopus
3 Pennino D.; Pizzonia M.; Vitaletti A.; Zecchin... Pennino, Diego (57207940361); Pizzonia, Mauriz... 57207940361; 6603249368; 6506463530; 57215119316 Blockchain as IoT Economy Enabler: A Review of... 2022 11 2 20.0 NaN NaN NaN 19 10.3390/jsan11020020 https://www.scopus.com/inward/record.uri?eid=2... In the IoT-based economy, a large number of su... applications of IoT and blockchain; blockchain... NaN Review Scopus
4 Santini S.; Roemer K.; Couderc P.; Marrón P.J.... Santini, Silvia (35303602900); Roemer, Kay (35... 35303602900; 35867822000; 22333533900; 6603114... System Architectures and Programming Models 2010 NaN NaN NaN 347 404.0 57.0 0 10.1002/9780470610817.ch5 https://www.scopus.com/inward/record.uri?eid=2... [No abstract available] Database view; Node internals; Programming mod... NaN Book chapter Scopus
In [ ]:
# Display the loaded DataFrame
df
In [ ]:
# Summary of the DataFrame's columns (dtypes and non-null counts)
df.info()
In [ ]:
# Number of missing values in each column
df.isna().sum()
In [92]:
# Extra words that `spacy_tokenizer` removes in addition to the entries of `stopwords`
my_stopwords = [
    "abstract",
    "sunrise",
    "available",
    "address",
    "propose",
    "problem",
    "request",
    "code",
    "deluge",
]
In [93]:
# Parser: load the `en_core_sci_lg` model once; `spacy_tokenizer` below calls it on every text it receives
parser = en_core_sci_lg.load()
# parser.max_length = 7000000 #Limit the size of the parser

def spacy_tokenizer(sentence):
    '''Preprocess the text of a scientific abstract.

    The text is parsed with the module-level ``parser``; every token is
    replaced by its lower-cased, stripped lemma, and lemmas found in
    ``stopwords``, ``punctuations`` or ``my_stopwords`` are dropped.

    Args:
        sentence: Raw abstract text. Anything that is not a ``str`` (for
            example the float NaN pandas stores for a missing cell, or
            ``None``) is treated as an empty document.

    Returns:
        str: The surviving lemmas joined by single spaces; ``""`` when the
        input is not a string or when no lemma survives the filters.
    '''
    # A missing abstract is not a string; return an empty document instead
    # of handing the value to the parser.
    if not isinstance(sentence, str):
        return ""

    # Lower-cased lemma of every token. The "-PRON-" placeholder lemma is
    # replaced by the lower-cased token text.
    lemmas = [
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in parser(sentence)
    ]
    # `punctuations` is a str (`string.punctuation` in this notebook), so the
    # `in` check is a substring test; it therefore also rejects an empty lemma.
    kept = [
        lemma
        for lemma in lemmas
        if lemma not in stopwords
        and lemma not in punctuations
        and lemma not in my_stopwords
    ]
    return " ".join(kept)
In [ ]:
# Filter vocabularies read by `spacy_tokenizer`
stopwords = [*STOP_WORDS]  # the imported STOP_WORDS collection, materialised as a list
punctuations = string.punctuation  # punctuation characters to remove from the text
# Show the first ten entries of the stop-word list
stopwords[:10]
In [ ]:
# The dataframe still contains a huge amount of data. To process the data faster, I reduce the df to 10000 rows
# The scope of the notebook is not to analyze all data
# df = df.sample(10000, random_state=42)
In [94]:
# Register tqdm's `progress_apply` on pandas objects so the apply below shows a progress bar
tqdm.pandas()
# Run `spacy_tokenizer` on every abstract and store the result in a new "processed_text" column.
# NOTE(review): an abstract made up only of filtered words (e.g. "[No abstract available]")
# presumably becomes an empty string here — confirm whether such rows should be excluded
# before topic modelling.
df["processed_text"] = df["Abstract"].progress_apply(spacy_tokenizer)
100%|██████████| 90/90 [00:03<00:00, 23.86it/s]
In [ ]:
# Display the raw "Abstract" column
df['Abstract']
In [ ]:
# Display the "processed_text" column produced by `spacy_tokenizer`
df["processed_text"]
In [119]:
# Configure the topic model: sentence-embedding model by name, clusters of at least two documents
topic_model = BERTopic(
    embedding_model="paraphrase-MiniLM-L6-v2",
    min_topic_size=2,
    verbose=True,
)
# Fit on the preprocessed abstracts and keep the per-document topic assignments.
# NOTE(review): no random seed is set in this cell, so results may differ between runs — confirm
# whether a seeded dimensionality-reduction model should be passed to BERTopic.
topics, probs = topic_model.fit_transform(df["processed_text"].to_numpy())
# Number of rows in the topic summary table
len(topic_model.get_topic_info())
2024-06-26 12:54:08,477 - BERTopic - Embedding - Transforming documents to embeddings.
Batches:   0%|          | 0/3 [00:00<?, ?it/s]
2024-06-26 12:54:15,162 - BERTopic - Embedding - Completed ✓
2024-06-26 12:54:15,165 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-26 12:54:17,718 - BERTopic - Dimensionality - Completed ✓
2024-06-26 12:54:17,720 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-26 12:54:17,735 - BERTopic - Cluster - Completed ✓
2024-06-26 12:54:17,742 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-26 12:54:17,778 - BERTopic - Representation - Completed ✓
Out[119]:
14
In [120]:
# Show the topic summary table of the fitted model
topic_model.get_topic_info()
Out[120]:
Topic Count Name Representation Representative_Docs
0 -1 5 -1_gms_maintenance_company_activity [gms, maintenance, company, activity, smart, c... [smart city leverage information communication...
1 0 17 0_sensor_wireless_network_datum [sensor, wireless, network, datum, node, energ... [sensor network consist sensing device exchang...
2 1 10 1_user_information_exchange_use [user, information, exchange, use, phone, tag,... [paper fully decentralized approach recommend ...
3 2 8 2_product_process_design_technology [product, process, design, technology, art, us... [gende http://www.gende.it tool allow designer...
4 3 8 3_protocol_dtn_delay_persistent [protocol, dtn, delay, persistent, delivery, t... [paper new taxonomy delay tol- erant network d...
5 4 7 4_blockchain_decentralized_nft_identity [blockchain, decentralized, nft, identity, sub... [application identity management idm use assoc...
6 5 7 5_datum_iid_clinical_trial [datum, iid, clinical, trial, learning, privac... [artificial intelligence-based ai analysis lar...
7 6 6 6_robinson_list_decentralised_poc [robinson, list, decentralised, poc, centralis... [, robinson list protect user privacy frequent...
8 7 4 7_result_simulator_testbed_infrastructure [result, simulator, testbed, infrastructure, s... [present discuss challenge solution pose desig...
9 8 4 8_signal_plant_electrical_stimulus [signal, plant, electrical, stimulus, feature,... [plant electrical signal contain low frequency...
10 9 4 9_social_population_real_protocols [social, population, real, protocols, interact... [paper present experimental analysis assess pe...
11 10 4 10_algorithm_topology_network_caching [algorithm, topology, network, caching, node, ... [study power randomization design online graph...
12 11 3 11_fragmented_tracking_market_highly [fragmented, tracking, market, highly, beerlov... [need blockchain orient software engineering b...
13 12 3 12_robot_interactive_surveillance_situation [robot, interactive, surveillance, situation, ... [paper present smart city architecture develop...
In [121]:
import plotly.io as pio
# Use Plotly's "colab" renderer.
# NOTE(review): the following cell assigns `pio.renderers.default` again, so when both cells
# run in order this value is overwritten.
pio.renderers.default = "colab"
In [122]:
import sys

import plotly.io as pio

# Pick the Plotly renderer that matches the host instead of always forcing "notebook":
# inside Google Colab the `google.colab` package is already loaded, so its presence in
# `sys.modules` identifies that environment; everywhere else keep the "notebook" renderer.
pio.renderers.default = "colab" if "google.colab" in sys.modules else "notebook"
In [123]:
# https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html

# Bar charts of the topic terms, restricted to the top 3 topics (`top_n_topics=3`)
topic_model.visualize_barchart(top_n_topics=3, height=700)
In [124]:
# Term-rank plot for the fitted topics (default arguments)
topic_model.visualize_term_rank()
In [125]:
# Inter-topic distance map. Only the last expression of a cell is rendered, so the figure is
# built once, with the intended `top_n_topics` limit, and returned as the cell's value.
topic_model.visualize_topics(top_n_topics=21)
In [126]:
# Hierarchy view of the topics, limited to the top 21 topics and drawn 800 px wide
topic_model.visualize_hierarchy(top_n_topics=21, width=800)
In [127]:
# Topic heatmap limited to the top 21 topics, with `n_clusters=5` used to group them
topic_model.visualize_heatmap(n_clusters=5, top_n_topics=21)
In [128]:
# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
# NOTE: You can hide the hover with `hide_document_hover=True`, which is especially helpful if you have a large dataset
# NOTE(review): `df["Title"]` is assumed to be row-aligned with the texts passed to `fit_transform` —
# confirm if rows are ever filtered between fitting and plotting.
topic_model.visualize_documents(df["Title"])

jq -M 'del(.metadata.widgets)' Abstract_Analysis.ipynb > Abstract_Analysis_new.ipynb

jupyter nbconvert --to html Abstract_Analysis_new.ipynb