% Conference paper from ISBI 2024 (see booktitle and note fields).
% Author names are stored in unambiguous Last, First form; the exporter's
% escaped braces around given names and around the final surname were removed
% because they print as literal braces and break name parsing and sorting.
% NOTE(review): address holds a country rather than the publisher's city;
% confirm the intended location before relying on it in a style that prints it.
@inproceedings{6275a02d2ea94a77b0880668092414c6,
  author    = {Gridach, Mourad and Alsharid, Mohammad and Jiao, Jianbo and Drukker, Lior and Papageorghiou, Aris T. and Noble, J. Alison},
  title     = {Dual Representation Learning From Fetal Ultrasound Video and Sonographer Audio},
  booktitle = {{IEEE} International Symposium on Biomedical Imaging, {ISBI} 2024 - Conference Proceedings},
  series    = {Proceedings - International Symposium on Biomedical Imaging},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2024},
  doi       = {10.1109/ISBI56570.2024.10635693},
  keywords  = {Multi-Modal, Self-Supervised, Ultrasound},
  abstract  = {This paper tackles the challenging problem of real-world data self-supervised representation learning from two modalities: fetal ultrasound (US) video and the corresponding speech acquired when a sonographer performs a pregnancy scan. We propose to transfer knowledge between the different modalities, even though the sonographer's speech and the US video may not be semantically correlated. We design a network architecture capable of learning useful representations such as of anatomical features and structures while recognising the correlation between an US video scan and the sonographer's speech. We introduce dual representation learning from US video and audio, which consists of two concepts: Multi-Modal Contrastive Learning and Multi-Modal Similarity Learning, in a latent feature space. Experiments show that the proposed architecture learns powerful representations and transfers well for two downstream tasks. Furthermore, we experiment with two different datasets for pretraining which differ in size and length of video clips (as well as sonographer speech) to show that the quality of the sonographer's speech plays an important role in the final performance.},
  language  = {British English},
  note      = {Publisher Copyright: {\textcopyright} 2024 IEEE.; 21st IEEE International Symposium on Biomedical Imaging, ISBI 2024 ; Conference date: 27-05-2024 Through 30-05-2024},
}