# https://www.kaggle.com/code/nechbamohammed/abstract-clustering-for-scientific-paper-insights
# https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing

!pip install bertopic
!pip install minisom


#from plotly.offline import init_notebook_mode
#init_notebook_mode(connected=True)


import pandas as pd                                        #Data processing, CSV files I/O (e.g. pd.read_csv)
import numpy as np                                         #Linear Algebra: Matrices ...
import matplotlib.pyplot as plt                            #Data Visualisation
import seaborn as sns
from bertopic import BERTopic

from tqdm import tqdm
# I discovered that it is possible to download models built specifically for preprocessing scientific texts.
# The spaCy docs list a project for this purpose: https://spacy.io/universe/project/scispacy
# Download the en_core_sci_lg model to preprocess the abstracts.
from IPython.utils import io
with io.capture_output() as captured:
    !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz


# Import the NLP libraries and the spaCy package used to preprocess the abstract text
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #import commen list of stopword
import en_core_sci_lg  # import downlaoded model
import string
from minisom import MiniSom
from sklearn.cluster import SpectralClustering
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score


# Load the Scopus export of abstracts into a DataFrame (file is read from the
# current working directory).
df =  pd.read_csv("scopus_vitaletti_abstracts.csv")
# Preview the first five rows.
df.head(5)

# Display the frame itself.
df


# Print column dtypes and non-null counts.
df.info()


# Count missing values per column.
df.isna().sum()


# Extra, corpus-specific words that spacy_tokenizer() drops from every abstract.
my_stopwords = [
    "abstract",
    "sunrise",
    "available",
    "address",
    "propose",
    "problem",
    "request",
    "code",
    "deluge",
]


# Parser: load the en_core_sci_lg pipeline once at module level;
# spacy_tokenizer() calls this object for every text it processes.
parser = en_core_sci_lg.load()
# parser.max_length = 7000000 #Limit the size of the parser

def spacy_tokenizer(sentence):
    """Lemmatise a scientific text and strip stop words and punctuation.

    The text is run through the module-level ``parser`` pipeline. Each token
    is replaced by its lower-cased, stripped lemma, and lemmas found in
    ``stopwords``, ``my_stopwords`` or ``punctuations`` are discarded.

    Args:
        sentence: Raw text to clean. Any non-string value (``None``, or the
            float NaN that pandas uses for a missing cell) is treated as
            missing text.

    Returns:
        The surviving lemmas joined by single spaces. An empty string is
        returned when nothing survives or when ``sentence`` is not a string.
    """
    # Series.apply hands missing cells over as float NaN; the spaCy pipeline
    # expects text, so report "no tokens" instead of raising mid-run.
    if not isinstance(sentence, str):
        return ""

    # Build one hashed lookup per call instead of scanning two lists per token.
    blocked = set(stopwords).union(my_stopwords)

    # "-PRON-" is the placeholder lemma that spaCy v2 assigns to pronouns; for
    # those tokens fall back to the lower-cased surface form.
    lemmas = (
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in parser(sentence)
    )

    # `punctuations` is a plain str, so this is a substring test: it removes
    # punctuation tokens and also the empty lemmas left behind by strip().
    kept = [
        lemma
        for lemma in lemmas
        if lemma not in blocked and lemma not in punctuations
    ]
    return " ".join(kept)


# Lookup data read by spacy_tokenizer() at call time.
# string.punctuation is a single str of ASCII punctuation characters, so
# `x in punctuations` is a substring test, not a per-element membership test.
punctuations = string.punctuation #list of punctuation to remove from text
# spaCy's English stop-word collection, materialised as a list.
stopwords = list(STOP_WORDS)
# Peek at the first ten entries.
stopwords[:10]


# The dataframe still contains a huge amount of data. To process the data faster, reduce the df to 10000 rows.
# Analyzing all of the data is outside the scope of this notebook.
# df = df.sample(10000, random_state=42)


# Register tqdm with pandas so that `progress_apply` is available.
tqdm.pandas()
# Clean every abstract with spacy_tokenizer() and store the result in a new
# "processed_text" column, showing a progress bar while it runs.
df["processed_text"] = df["Abstract"].progress_apply(spacy_tokenizer)

100%|██████████| 90/90 [00:03<00:00, 23.86it/s]


df['Abstract']


df["processed_text"]


# Configure BERTopic with the "paraphrase-MiniLM-L6-v2" embedding model and a
# minimum topic size of 2, with verbose logging enabled.
# NOTE(review): no random seed is set here, so topic assignments may differ
# between runs — confirm whether reproducibility matters.
topic_model = BERTopic(verbose=True, embedding_model="paraphrase-MiniLM-L6-v2", min_topic_size=2)
# Fit on the cleaned abstracts. The expression after ";" is the number of rows
# in get_topic_info(), i.e. one per topic including the -1 row.
topics, probs = topic_model.fit_transform(df["processed_text"].to_numpy()); len(topic_model.get_topic_info())

2024-06-26 12:54:08,477 - BERTopic - Embedding - Transforming documents to embeddings.

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-06-26 12:54:15,162 - BERTopic - Embedding - Completed ✓
2024-06-26 12:54:15,165 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-26 12:54:17,718 - BERTopic - Dimensionality - Completed ✓
2024-06-26 12:54:17,720 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-26 12:54:17,735 - BERTopic - Cluster - Completed ✓
2024-06-26 12:54:17,742 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-26 12:54:17,778 - BERTopic - Representation - Completed ✓

14


# Show the per-topic summary table of the fitted model.
topic_model.get_topic_info()


# Select the plotly renderer named "colab" as the default.
import plotly.io as pio
pio.renderers.default = "colab"


# Select the plotly renderer named "notebook" as the default.
# NOTE(review): when both cells run in order, this assignment replaces the
# "colab" choice above — keep only the one that matches the runtime in use.
import plotly.io as pio
pio.renderers.default='notebook'


# Visualization reference:
# https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html

# Bar charts for the top 3 topics, drawn at height 700.
topic_model.visualize_barchart(top_n_topics=3, height=700)


# Term-rank plot with default settings.
topic_model.visualize_term_rank()


# Topic overview plot. Build the figure once so that the top_n_topics limit
# applies to the figure that is actually rendered by .show().
topic_model.visualize_topics(top_n_topics=21).show()


# Topic hierarchy plot, limited to 21 topics and drawn 800 wide.
topic_model.visualize_hierarchy(top_n_topics=21, width=800)


# Topic heatmap, limited to 21 topics and requesting 5 clusters.
topic_model.visualize_heatmap(n_clusters=5, top_n_topics=21)


# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
# NOTE: You can hide the hover with `hide_document_hover=True` which is especially helpful if you have a large dataset
# NOTE(review): no precomputed embeddings are passed, so the titles themselves
# are presumably embedded to lay out the points rather than the processed
# abstracts the model was fitted on — confirm against the BERTopic docs.
topic_model.visualize_documents(df["Title"])

	Authors	Author full names	Author(s) ID	Title	Year	Volume	Issue	Art. No.	Page start	Page end	Page count	Cited by	DOI	Link	Abstract	Author Keywords	Index Keywords	Document Type	Source
0	Becchetti L.; Colesanti U.M.; Marchetti-Spacca...	Becchetti, Luca (56218807100); Colesanti, Ugo ...	56218807100; 24073192000; 7004071298; 6506463530	Recommending items in pervasive scenarios: Mod...	2011	28	3	NaN	555	578.0	23.0	9	10.1007/s10115-010-0338-4	https://www.scopus.com/inward/record.uri?eid=2...	In this paper, we propose and investigate the ...	Collaborative filtering; Decentralized recomme...	Aggregates; Behavioral research; Collaborative...	Article	Scopus
1	Chatzigiannakis I.; Mylonas G.; Vitaletti A.	Chatzigiannakis, Ioannis (6602645904); Mylonas...	6602645904; 36846385500; 6506463530	Urban pervasive applications: Challenges, scen...	2011	5	1	NaN	103	118.0	15.0	26	10.1016/j.cosrev.2010.09.003	https://www.scopus.com/inward/record.uri?eid=2...	In this work, we discuss various aspects of th...	Challenges; Games; NFC; Participatory; Pervasi...	NaN	Article	Scopus
2	Becchetti L.; Leonardi S.; Marchetti-Spaccamel...	Becchetti, L. (56218807100); Leonardi, S. (563...	56218807100; 56366507000; 7004071298; 65064635...	Parallel scheduling problems in next generatio...	2005	45	1	NaN	9	22.0	13.0	1	10.1002/net.20045	https://www.scopus.com/inward/record.uri?eid=2...	Next-generation 3G/4G wireless data networks a...	CDMA; Convex programming; On-line algorithms; ...	Algorithms; Code division multiple access; Opt...	Article	Scopus
3	Pennino D.; Pizzonia M.; Vitaletti A.; Zecchin...	Pennino, Diego (57207940361); Pizzonia, Mauriz...	57207940361; 6603249368; 6506463530; 57215119316	Blockchain as IoT Economy Enabler: A Review of...	2022	11	2	20.0	NaN	NaN	NaN	19	10.3390/jsan11020020	https://www.scopus.com/inward/record.uri?eid=2...	In the IoT-based economy, a large number of su...	applications of IoT and blockchain; blockchain...	NaN	Review	Scopus
4	Santini S.; Roemer K.; Couderc P.; Marrón P.J....	Santini, Silvia (35303602900); Roemer, Kay (35...	35303602900; 35867822000; 22333533900; 6603114...	System Architectures and Programming Models	2010	NaN	NaN	NaN	347	404.0	57.0	0	10.1002/9780470610817.ch5	https://www.scopus.com/inward/record.uri?eid=2...	[No abstract available]	Database view; Node internals; Programming mod...	NaN	Book chapter	Scopus

	Topic	Count	Name	Representation	Representative_Docs
0	-1	5	-1_gms_maintenance_company_activity	[gms, maintenance, company, activity, smart, c...	[smart city leverage information communication...
1	0	17	0_sensor_wireless_network_datum	[sensor, wireless, network, datum, node, energ...	[sensor network consist sensing device exchang...
2	1	10	1_user_information_exchange_use	[user, information, exchange, use, phone, tag,...	[paper fully decentralized approach recommend ...
3	2	8	2_product_process_design_technology	[product, process, design, technology, art, us...	[gende http://www.gende.it tool allow designer...
4	3	8	3_protocol_dtn_delay_persistent	[protocol, dtn, delay, persistent, delivery, t...	[paper new taxonomy delay tol- erant network d...
5	4	7	4_blockchain_decentralized_nft_identity	[blockchain, decentralized, nft, identity, sub...	[application identity management idm use assoc...
6	5	7	5_datum_iid_clinical_trial	[datum, iid, clinical, trial, learning, privac...	[artificial intelligence-based ai analysis lar...
7	6	6	6_robinson_list_decentralised_poc	[robinson, list, decentralised, poc, centralis...	[, robinson list protect user privacy frequent...
8	7	4	7_result_simulator_testbed_infrastructure	[result, simulator, testbed, infrastructure, s...	[present discuss challenge solution pose desig...
9	8	4	8_signal_plant_electrical_stimulus	[signal, plant, electrical, stimulus, feature,...	[plant electrical signal contain low frequency...
10	9	4	9_social_population_real_protocols	[social, population, real, protocols, interact...	[paper present experimental analysis assess pe...
11	10	4	10_algorithm_topology_network_caching	[algorithm, topology, network, caching, node, ...	[study power randomization design online graph...
12	11	3	11_fragmented_tracking_market_highly	[fragmented, tracking, market, highly, beerlov...	[need blockchain orient software engineering b...
13	12	3	12_robot_interactive_surveillance_situation	[robot, interactive, surveillance, situation, ...	[paper present smart city architecture develop...