site/datasets/citations/voxceleb2.json


1

{"id": "8875ae233bc074f5cd6c4ebba447b536a7e847a5", "paper": {"key": "voxceleb2", "name": "VoxCeleb2", "title": "VoxCeleb2: Deep Speaker Recognition.", "year": "2018", "addresses": []}, "citations": [{"id": "9461ae046dbbafbad095bbbc80d0b9e5931f6a72", "title": "Linkage Based Face Clustering via Graph Convolution Network", "addresses": [{"name": "Tsinghua University", "source_name": "Tsinghua University", "street_adddress": "\u6e05\u534e\u5927\u5b66, 30, \u53cc\u6e05\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100084, \u4e2d\u56fd", "lat": "40.00229045", "lng": "116.32098908", "type": "edu", "country": "China"}, {"name": "Australian National University", "source_name": "Australian National University", "street_adddress": "Canberra ACT 0200, Australia", "lat": "-35.27769990", "lng": "149.11852700", "type": "edu", "country": "Australia"}], "year": "2019", "pdf": ["https://arxiv.org/pdf/1903.11306.pdf"], "doi": []}, {"id": "827fdecf6a292cefb21837b9d11533a0e40f9e08", "title": "The Conversation: Deep Audio-Visual Speech Enhancement", "addresses": [{"name": "University of Oxford", "source_name": "University of Oxford", "street_adddress": "Radcliffe Camera, Radcliffe Square, Grandpont, Oxford, Oxon, South East, England, OX1 4AJ, UK", "lat": "51.75345380", "lng": "-1.25400997", "type": "edu", "country": "United Kingdom"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1804.04121.pdf"], "doi": []}, {"id": "073e98d1a443e7b5f8b65903f18cd05a2b884400", "title": "Unsupervised Learning of Object Landmarks through Conditional Image Generation", "addresses": [{"name": "University of Oxford", "source_name": "University of Oxford", "street_adddress": "Radcliffe Camera, Radcliffe Square, Grandpont, Oxford, Oxon, South East, England, OX1 4AJ, UK", "lat": "51.75345380", "lng": "-1.25400997", "type": "edu", "country": "United Kingdom"}, {"name": "University of Edinburgh", "source_name": "University of Edinburgh", "street_adddress": "New College, New College Courtyard, The Mound, Old Town, Edinburgh, City of Edinburgh, Scotland, EH1 2LX, UK", "lat": "55.94951105", "lng": "-3.19534913", "type": "edu", "country": "United Kingdom"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1806.07823.pdf"], "doi": []}, {"id": "a5985dfb2f90cd34d83613b4872ac72b298a876e", "title": "VoiceFilter: Targeted Voice Separation by Speaker-Conditioned Spectrogram Masking", "addresses": [{"name": "Google", "source_name": "Google, Inc.", "street_adddress": "1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA", "lat": "37.42199990", "lng": "-122.08405750", "type": "company", "country": "United States"}, {"name": "IDIAP Research Institute", "source_name": "IDIAP Research Institute", "street_adddress": "Idiap Research Institute, Parking Centre du parc, Martigny, Valais/Wallis, 1920, Schweiz/Suisse/Svizzera/Svizra", "lat": "46.10923700", "lng": "7.08453549", "type": "edu", "country": "Switzerland"}, {"name": "EPFL", "source_name": "EPFL", "street_adddress": "Route Cantonale, 1015 Lausanne, Switzerland", "lat": "46.51905570", "lng": "6.56675760", "type": "edu", "country": "Switzerland"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1810.04826.pdf"], "doi": []}, {"id": "4519b43e0d22934a2a3035d45f7626fe2e77643a", "title": "The VOiCES from a Distance Challenge 2019 Evaluation Plan", "addresses": [{"name": "SRI International", "source_name": "SRI International", "street_adddress": "SRI International Building, West 1st Street, Menlo Park, San Mateo County, California, 94025, USA", "lat": "37.45857960", "lng": "-122.17560525", "type": "edu", "country": "United States"}], "year": "2019", "pdf": ["https://arxiv.org/pdf/1902.10828.pdf"], "doi": []}, {"id": "54eef13b3fe487cc48a74faf8f486a312f545cc9", "title": "Self-supervised speaker embeddings", "addresses": [{"name": "Brno University of Technology", "source_name": "Brno University of Technology", "street_adddress": "1 548 Anton\u00ednsk\u00e1 Brno-st\u0159ed Brno \u010cesk\u00e1 republika, 601 90, Czechia", "lat": "49.20172000", "lng": "16.60331680", "type": "edu", "country": "Czech Republic"}], "year": "2019", "pdf": ["https://arxiv.org/pdf/1904.03486.pdf"], "doi": []}, {"id": "56a01913e58f721e38b5ddf6b30a588c8d4dfc66", "title": "Noise Robust Speaker Recognition Based on Adaptive Frame Weighting in GMM for i-Vector Extraction", "addresses": [{"name": "Tsinghua University", "source_name": "Tsinghua University", "street_adddress": "\u6e05\u534e\u5927\u5b66, 30, \u53cc\u6e05\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100084, \u4e2d\u56fd", "lat": "40.00229045", "lng": "116.32098908", "type": "edu", "country": "China"}], "year": "2019", "pdf": [], "doi": ["http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8653291"]}, {"id": "171f8f1090ef0533ff470ed5a4d31ecfefcc74be", "title": "Audio-Visual Scene Analysis with Self-Supervised Multisensory Features", "addresses": [{"name": "UC Berkeley", "source_name": "UC Berkeley", "street_adddress": "Berkeley, CA, USA", "lat": "37.87189920", "lng": "-122.25853990", "type": "edu", "country": "United States"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1804.03641.pdf"], "doi": []}, {"id": "acc405806229912e9723c55bf61dc1d34059f5d1", "title": "Few Shot Speaker Recognition using Deep Neural Networks", "addresses": [{"name": "Indian Institute of Technology Delhi, India", "source_name": "Indian Institute of Technology Delhi, India", "street_adddress": "IIT Campus, Hauz Khas, New Delhi, Delhi 110016, India", "lat": "28.54497560", "lng": "77.19262840", "type": "edu", "country": "India"}], "year": "2019", "pdf": ["https://arxiv.org/pdf/1904.08775.pdf"], "doi": []}, {"id": "a9cbd487b394d94d5f303c91d7f14dae14f70acf", "title": "Can We Use Speaker Recognition Technology to Attack Itself? Enhancing Mimicry Attacks Using Automatic Target Speaker Selection", "addresses": [{"name": "University of Eastern Finland", "source_name": "University of Eastern Finland, Joensuu, Finland", "street_adddress": "Yliopistokatu 7, 80130 Joensuu, Finland", "lat": "62.60539600", "lng": "29.74053700", "type": "edu", "country": "Finland"}, {"name": "INRIA, France", "source_name": "INRIA, France", "street_adddress": "Institut National de Recherche en Informatique et en Automatique, 54600 Villers-l\u00e8s-Nancy, France", "lat": "48.66544710", "lng": "6.15702390", "type": "edu", "country": "France"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1811.03790.pdf"], "doi": []}, {"id": "b808cfac9c44f27d3716f9280dad4dc2a9bbc8df", "title": "FOR MULTI-SPEAKER CONVERSATIONS USING X-VECTORS", "addresses": [{"name": "Johns Hopkins University", "source_name": "Johns Hopkins University", "street_adddress": "Baltimore, MD 21218, USA", "lat": "39.32990130", "lng": "-76.62051770", "type": "edu", "country": "United States"}], "year": "2018", "pdf": ["https://pdfs.semanticscholar.org/b808/cfac9c44f27d3716f9280dad4dc2a9bbc8df.pdf"], "doi": []}, {"id": "35a2ee8bb43ad14b966c294f599475c1edd5213e", "title": "Centroid-based deep metric learning for speaker recognition", "addresses": [{"name": "University of Toronto", "source_name": "University of Toronto", "street_adddress": "University of Toronto, St. George Street, Bloor Street Culture Corridor, Old Toronto, Toronto, Ontario, M5S 1A5, Canada", "lat": "43.66333345", "lng": "-79.39769975", "type": "edu", "country": "Canada"}], "year": "2019", "pdf": ["https://arxiv.org/pdf/1902.02375.pdf"], "doi": []}, {"id": "2c75e7ee01b362cc5f1ef2e49a84d56af93e6a3d", "title": "Training Speaker Recognition Models with Recording-Level Labels", "addresses": [{"name": "Tallinn University of Technology", "source_name": "Institute of Cybernetics, Tallinn University of Technology", "street_adddress": "Akadeemia tee 21, 12618 Tallinn, Estonia", "lat": "59.39791420", "lng": "24.66085570", "type": "edu", "country": "Estonia"}], "year": "2018", "pdf": [], "doi": ["http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8639601"]}, {"id": "f2640379adc41360adc1ff03c851b30e509e41b8", "title": "End-to-end losses based on speaker basis vectors and all-speaker hard negative mining for speaker verification", "addresses": [{"name": "University of Seoul", "source_name": "University of Seoul, Seoul, Korea", "street_adddress": "163 Seoulsiripdae-ro, Jeonnong 2(i)-dong, Dongdaemun-gu, Seoul, South Korea", "lat": "37.58386570", "lng": "127.05877710", "type": "edu", "country": "South Korea"}], "year": "2019", "pdf": ["https://arxiv.org/pdf/1902.02455.pdf"], "doi": []}, {"id": "0e8cd058ae29c6f60a8750c1df3caa5dc0e99543", "title": "You Said That?: Synthesising Talking Faces from Audio", "addresses": [{"name": "University of Oxford", "source_name": "University of Oxford", "street_adddress": "Radcliffe Camera, Radcliffe Square, Grandpont, Oxford, Oxon, South East, England, OX1 4AJ, UK", "lat": "51.75345380", "lng": "-1.25400997", "type": "edu", "country": "United Kingdom"}], "year": "2019", "pdf": ["http://www.robots.ox.ac.uk/~vgg/publications/2019/Jamaludin19/jamaludin19.pdf"], "doi": ["https://doi.org/10.1007/s11263-019-01150-y"]}, {"id": "fa62bd7c8e29a7f2c2104f7b769b487ab9dad4fb", "title": "Deep Neural Network Embedding Learning with High-Order Statistics for Text-Independent Speaker Verification", "addresses": [{"name": "University of Science and Technology of China", "source_name": "University of Science and Technology of China", "street_adddress": "\u4e2d\u56fd\u79d1\u5b66\u6280\u672f\u5927\u5b66 \u4e1c\u6821\u533a, 96\u53f7, \u91d1\u5be8\u8def, \u6c5f\u6dee\u5316\u80a5\u5382\u5c0f\u533a, \u829c\u6e56\u8def\u8857\u9053, \u5408\u80a5\u5e02\u533a, \u5408\u80a5\u5e02, \u5b89\u5fbd\u7701, 230026, \u4e2d\u56fd", "lat": "31.83907195", "lng": "117.26420748", "type": "edu", "country": "China"}], "year": "2019", "pdf": ["https://arxiv.org/pdf/1903.12058.pdf"], "doi": []}, {"id": "37db8a78a6bb47f6de37f5e42f6df5cd7b1bd304", "title": "Who Do I Sound Like? Showcasing Speaker Recognition Technology by YouTube Voice Search", "addresses": [{"name": "University of Eastern Finland", "source_name": "University of Eastern Finland, Joensuu, Finland", "street_adddress": "Yliopistokatu 7, 80130 Joensuu, Finland", "lat": "62.60539600", "lng": "29.74053700", "type": "edu", "country": "Finland"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1811.03293.pdf"], "doi": []}, {"id": "78efccbfd1b1fca267c1b7903d3e4344c9d54ce3", "title": "Symbolic Tensor Neural Networks for Digital Media - from Tensor Processing via BNF Graph Rules to CREAMS Applications", "addresses": [{"name": "Warsaw University of Technology", "source_name": "Warsaw University of Technology", "street_adddress": "Politechnika Warszawska, 1, Plac Politechniki, VIII, \u015ar\u00f3dmie\u015bcie, Warszawa, mazowieckie, 00-661, RP", "lat": "52.22165395", "lng": "21.00735776", "type": "edu", "country": "Poland"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1809.06582.pdf"], "doi": []}, {"id": "65a9d4b8740b7ae48127d7eae9443086613a83a7", "title": "FML: Face Model Learning from Videos", "addresses": [{"name": "Stanford University", "source_name": "Stanford University", "street_adddress": "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "lat": "37.43131385", "lng": "-122.16936535", "type": "edu", "country": "United States"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1812.07603.pdf"], "doi": []}, {"id": "4da8fe0379af893cca721276e13db3622955b3e7", "title": "Fully Supervised Speaker Diarization", "addresses": [{"name": "Google", "source_name": "Google, Inc.", "street_adddress": "1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA", "lat": "37.42199990", "lng": "-122.08405750", "type": "company", "country": "United States"}, {"name": "Columbia University", "source_name": "Columbia University", "street_adddress": "Columbia University Medical Center, 630, West 168th Street, Washington Heights, Manhattan, Manhattan Community Board 12, New York County, NYC, New York, 10031, USA", "lat": "40.84198360", "lng": "-73.94368971", "type": "edu", "country": "United States"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1810.04719.pdf"], "doi": []}, {"id": "5b65716709a2a7a4da2e2aeb611f82e7aacfbbf0", "title": "Large Margin Softmax Loss for Speaker Verification", "addresses": [{"name": "Tsinghua University", "source_name": "Tsinghua University", "street_adddress": "\u6e05\u534e\u5927\u5b66, 30, \u53cc\u6e05\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100084, \u4e2d\u56fd", "lat": "40.00229045", "lng": "116.32098908", "type": "edu", "country": "China"}], "year": "2019", "pdf": ["https://arxiv.org/pdf/1904.03479.pdf"], "doi": []}, {"id": "70cfbf82cee2f007b3c65cf97373f66f13005b42", "title": "Multi-Task Learning with High-Order Statistics for X-vector based Text-Independent Speaker Verification", "addresses": [{"name": "University of Science and Technology of China", "source_name": "University of Science and Technology of China", "street_adddress": "\u4e2d\u56fd\u79d1\u5b66\u6280\u672f\u5927\u5b66 \u4e1c\u6821\u533a, 96\u53f7, \u91d1\u5be8\u8def, \u6c5f\u6dee\u5316\u80a5\u5382\u5c0f\u533a, \u829c\u6e56\u8def\u8857\u9053, \u5408\u80a5\u5e02\u533a, \u5408\u80a5\u5e02, \u5b89\u5fbd\u7701, 230026, \u4e2d\u56fd", "lat": "31.83907195", "lng": "117.26420748", "type": "edu", "country": "China"}], "year": "2019", "pdf": ["https://arxiv.org/pdf/1903.12058.pdf"], "doi": []}, {"id": "6f0ce4d957c4e9556b04b539105837a5db63b925", "title": "Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis", "addresses": [{"name": "Google", "source_name": "Google, Inc.", "street_adddress": "1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA", "lat": "37.42199990", "lng": "-122.08405750", "type": "company", "country": "United States"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1806.04558.pdf"], "doi": []}, {"id": "b3918fab36f106e83e016a3e33d260ad656191c4", "title": "MCE 2018: The 1st Multi-target Speaker Detection and Identification Challenge Evaluation", "addresses": [{"name": "Johns Hopkins University", "source_name": "Johns Hopkins University", "street_adddress": "Baltimore, MD 21218, USA", "lat": "39.32990130", "lng": "-76.62051770", "type": "edu", "country": "United States"}, {"name": "MIT Lincoln Laboratory, Lexington, MA, USA", "source_name": "MIT Lincoln Laboratory, Lexington, MA, USA", "street_adddress": "244 Wood St, Lexington, MA 02421, USA", "lat": "42.45895890", "lng": "-71.26749520", "type": "edu", "country": "United States"}], "year": "2018", "pdf": ["https://arxiv.org/pdf/1904.04240.pdf"], "doi": []}]}