@article{DANGELO2024112162,
  author  = {D'Angelo, Andrea and d'Aloisio, Giordano and Marzi, Francesca and {Di Marco}, Antinisca and Stilo, Giovanni},
  title   = {Uncovering gender gap in academia: A comprehensive analysis within the software engineering community},
  journal = {Journal of Systems and Software},
  volume  = {217},
  year    = {2024},
  issn    = {0164-1212},
  doi     = {10.1016/j.jss.2024.112162},
  file    = {jss.pdf},
}
Gender gap in education has gained considerable attention in recent years, as it carries profound implications for the academic community. However, while the problem has been tackled from a student perspective, research is still lacking from an academic point of view. In this work, our main objective is to address this unexplored area by shedding light on the intricate dynamics of gender gap within the Software Engineering (SE) community. To this aim, we first review how the problem of gender gap in the SE community and in academia has been addressed by the literature so far. Results show that men in SE build more tightly-knit clusters but fewer global co-authorship relations than women, but the networks do not exhibit homophily. Concerning academic promotions, the Software Engineering community presents a higher bias in promotions to Associate Professors and a smaller bias in promotions to Full Professors than the overall Informatics community.
@article{10.1093/nargab/lqae033,
  author  = {Bianchi, Andrea and Zelli, Veronica and D'Angelo, Andrea and others},
  title   = {A method to comprehensively identify germline {SNVs}, {INDELs} and {CNVs} from whole exome sequencing data of {BRCA1/2} negative breast cancer patients},
  journal = {NAR Genomics and Bioinformatics},
  year    = {2024},
  issn    = {2631-9268},
  doi     = {10.1093/nargab/lqae033},
  file    = {nargab.pdf},
}
In the rapidly evolving field of genomics, understanding the genetic basis of complex diseases like breast cancer, particularly its familial/hereditary forms, is crucial. Current methods often examine genomic variants—such as Single Nucleotide Variants (SNVs), insertions/deletions (Indels), and Copy Number Variations (CNVs)—separately, lacking an integrated approach. Here, we introduced a robust, flexible methodology for a comprehensive variants’ analysis using Whole Exome Sequencing (WES) data. Our approach uniquely combines meticulous validation with an effective variant filtering strategy. By reanalyzing two germline WES datasets from BRCA1/2 negative breast cancer patients, we demonstrated our tool’s efficiency and adaptability, uncovering both known and novel variants. This contributed new insights for potential diagnostic, preventive, and therapeutic strategies. Our method stands out for its comprehensive inclusion of key genomic variants in a unified analysis, and its practical resolution of technical challenges, offering a pioneering solution in genomic research. This tool presents a breakthrough in providing detailed insights into the genetic alterations in genomes, with significant implications for understanding and managing hereditary breast cancer.
@article{DALOISIO2023103226,
  author  = {d'Aloisio, Giordano and D'Angelo, Andrea and {Di Marco}, Antinisca and Stilo, Giovanni},
  title   = {{Debiaser for Multiple Variables} to enhance fairness in classification tasks},
  journal = {Information Processing \& Management},
  volume  = {60},
  number  = {2},
  year    = {2023},
  issn    = {0306-4573},
  doi     = {10.1016/j.ipm.2022.103226},
  file    = {demv.pdf},
}
Nowadays assuring that search and recommendation systems are fair and do not apply discrimination among any kind of population has become of paramount importance. This is also highlighted by some of the sustainable development goals proposed by the United Nations. Those systems typically rely on machine learning algorithms that solve the classification task. Although the problem of fairness has been widely addressed in binary classification, unfortunately, the fairness of multi-class classification problem needs to be further investigated lacking well-established solutions. For the aforementioned reasons, in this paper, we present the Debiaser for Multiple Variables (DEMV), an approach able to mitigate unbalanced groups bias (i.e., bias caused by an unequal distribution of instances in the population) in both binary and multi-class classification problems with multiple sensitive variables. The proposed method is compared, under several conditions, with a set of well-established baselines using different categories of classifiers. At first we conduct a specific study to understand which are the best generation strategies and their impact on DEMV’s ability to improve fairness. Then, we evaluate our method on a heterogeneous set of datasets and we show how it overcomes the established algorithms of the literature in the multi-class classification setting and in the binary classification setting when more than two sensitive variables are involved. Finally, based on the conducted experiments, we discuss strengths and weaknesses of our method and of the other baselines.
@article{DellaPenna2023,
  author  = {{Della Penna}, Giuseppe and Orefice, Sergio and D'Angelo, Andrea},
  title   = {Exploiting spatial relations for grammar-based specification of multidimensional languages},
  journal = {Knowledge and Information Systems},
  year    = {2023},
  issn    = {0219-3116},
  doi     = {10.1007/s10115-023-01879-6},
  file    = {grammar_journal.pdf},
}
As opposed to textual programming languages, multidimensional languages compiler construction paradigms lack standardization. To this aim, in this paper we present the spatial grammar (SG) formalism, a grammar model for multidimensional languages which has string-like productions containing more general spatial relations other than string concatenation, and we provide mapping rules to translate an SG specification into a translation schema. In this way, the SG formalism inherits and extends to the multidimensional context concepts and techniques of standard compiler generation tools like YACC.
@inproceedings{10.1007/978-3-031-66326-0_6,
  author    = {d'Aloisio, Giordano and D'Angelo, Andrea and Marzi, Francesca and {Di Marco}, Diana and Stilo, Giovanni and {Di Marco}, Antinisca},
  editor    = {Tekinerdo{\u{g}}an, Bedir and Spalazzese, Romina and S{\"o}zer, Hasan and Bonfanti, Silvia and Weyns, Danny},
  title     = {Data-Driven Analysis of Gender Fairness in the Software Engineering Academic Landscape},
  booktitle = {Software Architecture. {ECSA} 2023 Tracks, Workshops, and {Ph.D.} Symposium},
  year      = {2024},
  isbn      = {978-3-031-66326-0},
  doi       = {10.1007/978-3-031-66326-0_6},
  file      = {ecsa.pdf},
}
Gender bias in education gained considerable relevance in the literature over the years. However, while the problem of gender bias in education has been widely addressed from a student perspective, it is still not fully analysed from an academic point of view. In this work, we study the problem of gender bias in academic promotions (i.e., from Researcher to Associate Professor and from Associate to Full Professor) in the informatics (INF) and software engineering (SE) Italian communities (we restricted to the Italian community since each country has specific and own promotion systems). In particular, we first conduct a literature review to assess how the problem of gender bias in academia has been addressed so far. Next, we describe a process to collect and preprocess the INF and SE data needed to analyse gender bias in Italian academic promotions. Subsequently, we apply a formal bias metric to these data to assess the amount of bias and look at its variation over time. From the conducted analysis, we observe how the SE community presents a higher bias in promotions to Associate Professors and a smaller bias in promotions to Full Professors compared to the overall INF community.
@inproceedings{10.1145/3643991.3644869,
  author    = {D'Angelo, Andrea and Di Sipio, Claudio and Politowski, Cristiano and Rubei, Riccardo},
  title     = {{PlayMyData}: a curated dataset of multi-platform video games},
  booktitle = {Proceedings of the 21st International Conference on Mining Software Repositories},
  series    = {MSR '24},
  year      = {2024},
  isbn      = {9798400705878},
  doi       = {10.1145/3643991.3644869},
  url       = {https://doi.org/10.1145/3643991.3644869},
  file      = {playmydata.pdf},
}
Being predominant in digital entertainment for decades, video games have been recognized as valuable software artifacts by the software engineering (SE) community just recently. Such an acknowledgment has unveiled several research opportunities, spanning from empirical studies to the application of AI techniques for classification tasks. In this respect, several curated game datasets have been disclosed for research purposes even though the collected data are insufficient to support the application of advanced models or to enable interdisciplinary studies. Moreover, the majority of those are limited to PC games, thus excluding notorious gaming platforms, e.g., PlayStation, Xbox, and Nintendo. In this paper, we propose PlayMyData, a curated dataset composed of 99,864 multi-platform games gathered by the IGDB website. By exploiting a dedicated API, we collect relevant metadata for each game, e.g., description, genre, rating, gameplay video URLs, and screenshots. Furthermore, we enrich PlayMyData with the timing needed to complete each game by mining the HLTB website. To the best of our knowledge, this is the most comprehensive dataset in the domain that can be used to support different automated tasks in SE. More importantly, PlayMyData can be used to foster cross-domain investigations built on top of the provided multimedia data.
@inproceedings{10.1145/3629527.3651844,
  author    = {D'Angelo, Andrea and d'Aloisio, Giordano},
  title     = {Grammar-Based Anomaly Detection of Microservice Systems Execution Traces},
  booktitle = {Companion of the 15th {ACM/SPEC} International Conference on Performance Engineering},
  series    = {ICPE '24 Companion},
  year      = {2024},
  doi       = {10.1145/3629527.3651844},
  url       = {https://doi.org/10.1145/3629527.3651844},
  file      = {grammar_conf.pdf},
}
Microservice architectures are a widely adopted architectural pattern for large-scale applications. Given the large adoption of these systems, several works have been proposed to detect performance anomalies starting from analysing the execution traces. However, most of the proposed approaches rely on machine learning (ML) algorithms to detect anomalies. While ML methods may be effective in detecting anomalies, the training and deployment of these systems has been shown to be less efficient in terms of time, computational resources, and energy required. In this paper, we propose a novel approach based on Context-free grammar for anomaly detection of microservice systems execution traces. We employ the SAX encoding to transform execution traces into strings. Then, we select strings encoding anomalies, and for each possible anomaly, we build a Context-free grammar using the Sequitur grammar induction algorithm. We test our approach on two real-world datasets and compare it with a Logistic Regression classifier. We show how our approach is more effective in terms of training time of 15 seconds with a minimum loss in effectiveness of 5% compared to the Logistic Regression baseline.
@inproceedings{Bianchi2022DIORAMA,
  author    = {Bianchi, Andrea and d'Aloisio, Giordano and D'Angelo, Andrea and others},
  title     = {{DIORAMA}: {Digital twIn fOR sustAinable territorial MAnagement}},
  booktitle = {Proceedings of the 1st {Italian} Conference on Big Data and Data Science ({ITADATA} 2022)},
  series    = {CEUR Workshop Proceedings},
  volume    = {3340},
  year      = {2022},
  url       = {https://ceur-ws.org/Vol-3340/paper43.pdf},
  file      = {diorama.pdf},
}
@inproceedings{10.1007/978-3-031-09316-6_11,
  author        = {d'Aloisio, Giordano and Stilo, Giovanni and {Di Marco}, Antinisca and D'Angelo, Andrea},
  title         = {Enhancing Fairness in Classification Tasks with Multiple Variables: A Data- and Model-Agnostic Approach},
  booktitle     = {Advances in Bias and Fairness in Information Retrieval},
  publisher     = {Springer International Publishing},
  year          = {2022},
  doi           = {10.1007/978-3-031-09316-6_11},
  file          = {ecsa.pdf},
  internal-note = {NOTE(review): file is the same ecsa.pdf used by entry 10.1007/978-3-031-66326-0_6; possibly a copy-paste slip, confirm the intended PDF},
}
Nowadays assuring that search and recommendation systems are fair and do not apply discrimination among any kind of population has become of paramount importance. Those systems typically rely on machine learning algorithms that solve the classification task. Although the problem of fairness has been widely addressed in binary classification, unfortunately, the fairness of multi-class classification problem needs to be further investigated lacking well-established solutions. For the aforementioned reasons, in this paper, we present the Debiaser for Multiple Variables, a novel approach able to enhance fairness in both binary and multi-class classification problems. The proposed method is compared, under several conditions, with the well-established baseline. We evaluate our method on a heterogeneous data set and prove how it overcomes the established algorithms in the multi-classification setting, while maintaining good performances in binary classification. Finally, we present some limitations and future improvements.
@unpublished{report,
  author = {D'Angelo, Andrea and others},
  title  = {Report on Female Participation in {Informatics} degrees in {Europe}},
  note   = {Unpublished report},
  year   = {2024},
  file   = {femalereport.pdf},
}
This study aims to enrich and leverage data from the Informatics Europe Higher Education (IEHE) data portal to extract and analyze trends in female participation in Informatics across Europe. The research examines the proportion of female students, first-year enrollments, and degrees awarded to women in the field. The issue of low female participation in Informatics has long been recognized as a persistent challenge and remains a critical area of scholarly inquiry. Furthermore, existing literature indicates that socio-economic factors can unpredictably influence female participation, complicating efforts to address the gender gap. The analysis focuses on participation data from research universities at various academic levels, including Bachelors, Masters, and PhD programs, and seeks to uncover potential correlations between female participation and geographical or economic zones. The dataset was first enriched by integrating additional information, such as each country’s GDP and relevant geographical data, sourced from various online repositories. Subsequently, the data was cleaned to ensure consistency and eliminate incomplete time series. A final set of complete time series was selected for further analysis. We then used the data collected from the internet to assign countries to different clusters. Specifically, we employed Economic Zone, Geographical Area, and GDP quartile to cluster countries and compare their temporal trends both within and between clusters. We analyze the results for each classification and derive conclusions based on the available data.
@unpublished{reporu,
  author = {D'Angelo, Andrea},
  title  = {A novel Relevance Score for Unsupervised Retrieval with Large Language Models},
  year   = {2024},
  note   = {Presented at KDD 2024 PhD Consortium},
  file   = {kdd_novel_score.pdf},
}
Large Language Models (LLMs) have found widespread success in many Natural Language Processing (NLP) tasks. In particular, in unsupervised document retrieval and Retrieval Augmented Generators (RAGs), LLMs are typically employed by pooling their embeddings, resulting in a Relevance Score function defined as the dot product between the mean vectors of a query and a document. However, collapsing the term embeddings into a single sentence embedding may lead to a loss of valuable information, potentially reducing ranking effectiveness. This research proposes DbU-Cloud, a novel density-based method to address these challenges in unsupervised document ranking by eliminating pooling layers from the computation of the Relevance Score for each document, and instead considering a density-based metric derived from outlier detection.
@unpublished{francesca_evaluation,
  author = {Ciccarelli, Francesca and D'Angelo, Andrea and Stilo, Giovanni},
  title  = {Towards a Novel Visual Evaluation of Algorithmic Bias: Insights on the {Italian} Academic System},
  note   = {Presented at KDD 2024 Undergraduate Consortium},
  year   = {2024},
  file   = {novel_evaluation.pdf},
}
Bias in real-world applications and machine-learning systems leads to inequitable outcomes and perpetuates disparities. Traditional methods often fail to capture bias complexities fully. We introduce a novel use of the ROC curve to analyze classifier performance across subgroups, offering a more detailed understanding of bias. Validated through a case study on the Italian academic system, our approach effectively evaluates gender disparities in career progression. Our contributions include an innovative ROC curve application, practical validation, and a framework for enhancing fairness and inclusivity in decision-making processes.
Andrea D'Angelo
PhD Student in Computer Science
University of L'Aquila
University of L'Aquila
67100 L'Aquila
Italy
© 2025 Andrea D'Angelo