summaryrefslogtreecommitdiff
path: root/site/datasets/citations/vqa.json
blob: 11db0453181adc1667f6ba5996e1a69d1f6436b9 (plain)
1
{"id": "01959ef569f74c286956024866c1d107099199f7", "paper": {"paperId": "01959ef569f74c286956024866c1d107099199f7", "key": "vqa", "title": "VQA: Visual Question Answering", "journal": "2015 IEEE International Conference on Computer Vision (ICCV)", "address": "", "country": "", "address_type": "", "lat": "", "lng": "", "pdf_link": "https://arxiv.org/pdf/1505.00468.pdf", "report_link": "papers/01959ef569f74c286956024866c1d107099199f7.html", "citation_count": 731, "citations_geocoded": 445, "citations_unknown": 286, "citations_empty": 47, "citations_pdf": 629, "citations_doi": 96, "name": "VQA"}, "address": null, "citations": [["Query-Focused Video Summarization: Dataset, Evaluation, and a Memory Network Based Approach", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2017"], ["A Unified Framework for Multimodal Domain Adaptation", "Hefei University of Technology, HeFei, China", "Hefei University of Technology", "Hefei University of Technology", "\u5408\u80a5\u5de5\u4e1a\u5927\u5b66\uff08\u5c6f\u6eaa\u8def\u6821\u533a\uff09, 193\u53f7, \u5357\u4e00\u73af\u8def, \u822a\u8fd0\u5357\u6751, \u5305\u516c\u8857\u9053, \u5408\u80a5\u5e02\u533a, \u5408\u80a5\u5e02, \u5b89\u5fbd\u7701, 230009, \u4e2d\u56fd", "31.84691800", "117.29053367", "edu", "", "China", "2018"], ["Multimodal sentiment analysis with word-level fusion and reinforcement learning", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Video Fill in the Blank with Merging LSTMs", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2016"], ["Visual Question Generation as Dual Task of Visual Question Answering", "", "Microsoft Research Asia", "Microsoft Research Asia", "1 Memorial Dr, Cambridge, MA 02142, USA", "42.36142560", "-71.08120920", "company", "", "United States", "2017"], ["Convolutional Network for Attribute-driven and Identity-preserving Human Face Generation", "", "Harbin Institute of Technology", "Harbin Institute of Technology", "\u54c8\u5c14\u6ee8\u5de5\u4e1a\u5927\u5b66, \u53f8\u4ee4\u8857, \u5357\u5c97\u533a, \u54c8\u5c14\u6ee8\u5e02 / Harbin, \u9ed1\u9f99\u6c5f\u7701, 150000, \u4e2d\u56fd", "45.74139210", "126.62552755", "edu", "", "China", "2016"], ["Priming Neural Networks", "", "York University", "York University", "York University, Keele Campus, Campus Walk, North York, Toronto, Ontario, M3J 2S5, Canada", "43.77439110", "-79.50481085", "edu", "", "Canada", "2017"], ["Multimodal Dual Attention Memory for Video Story Question Answering", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2018"], ["VQS: Linking Segmentations to Questions and Answers for Supervised Attention in VQA and Question-Focused Semantic Segmentation", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2017"], ["Mention Recommendation for Multimodal Microblog with Cross-attention Memory Network", "Shandong University, Jinan, China", "Shandong University", "Shandong University", "\u5c71\u4e1c\u5927\u5b66, \u6cf0\u5b89\u8857, \u9ccc\u5c71\u536b\u8857\u9053, \u5373\u58a8\u533a, \u9752\u5c9b\u5e02, \u5c71\u4e1c\u7701, 266200, \u4e2d\u56fd", "36.36934730", "120.67381800", "edu", "", "China", "2018"], ["Commonsense Justification for Action Explanation", "", "Michigan State University", "Michigan State University", "Michigan State University, Farm Lane, East Lansing, Ingham County, Michigan, 48824, USA", "42.71856800", "-84.47791571", "edu", "", "United States", "2018"], ["Cross-modal Metric Learning with Graph Embedding", "Department of Electronic Engineering, Fudan University, Shanghai, 200433, China", "Fudan University", "Fudan University", "\u590d\u65e6\u5927\u5b66, 220, \u90af\u90f8\u8def, \u4e94\u89d2\u573a\u8857\u9053, \u6768\u6d66\u533a, \u4e0a\u6d77\u5e02, 200433, \u4e2d\u56fd", "31.30104395", "121.50045497", "edu", "", "China", "2018"], ["Language-Based Image Editing with Recurrent Attentive Models", "", "University of California", "University of California", "Berkeley, CA, USA", "37.87189920", "-122.25853990", "edu", "", "United States", "2017"], ["Deep Learning Based Multi-modal Addressee Recognition in Visual Scenes with Utterances", "", "Tokyo Institute of Technology", "Tokyo Institute of Technology", "\u6771\u4eac\u5de5\u696d\u5927\u5b66, \u539a\u6728\u8857\u9053, \u7dd1\u533a, \u753a\u7530\u5e02, \u795e\u5948\u5ddd\u770c, \u95a2\u6771\u5730\u65b9, 226-0026, \u65e5\u672c", "35.51675380", "139.48342251", "edu", "", "Japan", "2018"], ["Question Relevance in Visual Question Answering", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2018"], ["C-VQA: A Compositional Split of the Visual Question Answering (VQA) v1.0 Dataset", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["Recent Advances in Neural Program Synthesis", "", "University of California, Berkeley", "University of California, Berkeley", "Berkeley Art Museum and Pacific Film Archive, Bancroft Way, Southside, Berkeley, Alameda County, California, 94720-1076, USA", "37.86871260", "-122.25586815", "edu", "", "United States", "2018"], ["Neural Module Networks", "", "University of California, Berkeley", "University of California, Berkeley", "Berkeley Art Museum and Pacific Film Archive, Bancroft Way, Southside, Berkeley, Alameda County, California, 94720-1076, USA", "37.86871260", "-122.25586815", "edu", "", "United States", "2016"], ["Guide Me: Interacting with Deep Networks", "", "Johns Hopkins University", "Johns Hopkins University", "Baltimore, MD 21218, USA", "39.32990130", "-76.62051770", "edu", "", "", "2018"], ["Solving Visual Madlibs with Multiple Cues", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2016"], ["Learning Subjective Adjectives from Images by Stacked Convolutional Auto-Encoders", "", "Kyoto University", "Kyoto University", "\u4eac\u90fd\u5927\u5b66, \u4eca\u51fa\u5ddd\u901a, \u5409\u7530\u6cc9\u6bbf\u753a, \u5de6\u4eac\u533a, \u4eac\u90fd\u5e02, \u4eac\u90fd\u5e9c, \u8fd1\u757f\u5730\u65b9, 606-8501, \u65e5\u672c", "35.02749960", "135.78154513", "edu", "", "Japan", "2017"], ["Multimodal Neural Machine Translation for Low-resource Language Pairs using Synthetic Data", "", "Dublin City University", "DUBLIN CITY UNIVERSITY", "Dublin City University Glasnevin Campus, Lower Car Park, Wad, Whitehall A ED, Dublin 9, Dublin, County Dublin, Leinster, D09 FW22, Ireland", "53.38522185", "-6.25740874", "edu", "", "Ireland", "2018"], ["Examining Cooperation in Visual Dialog Models", "", "University of Amsterdam", "University of Amsterdam", "Institute for Logic, Language and Computation (ILLC), 107, Science Park, Oost-Watergraafsmeer, Amsterdam, Oost, Amsterdam, Noord-Holland, Nederland, 1098XG, Nederland", "52.35536550", "4.95016440", "edu", "", "Netherlands", "2017"], ["Neural Baby Talk", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2018"], ["Convolutional Neural Networks for Aerial Vehicle Detection and Recognition", "", "University of Liverpool", "University of Liverpool", "Victoria Building, Brownlow Hill, Knowledge Quarter, Liverpool, North West England, England, L3, UK", "53.40617900", "-2.96670819", "edu", "", "United Kingdom", "2018"], ["Stacked Attention Networks for Image Question Answering", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2016"], ["Creativity: Generating Diverse Questions Using Variational Autoencoders", "", "Northwestern University", "Northwestern University", "Northwestern University, Northwestern Place, Downtown, Evanston, Cook County, Illinois, 60208, USA", "42.05511640", "-87.67581113", "edu", "", "United States", "2017"], ["A Cost-Sensitive Visual Question-Answer Framework for Mining a Deep And-OR Object Semantics from Web Images", "", "University of California", "University of California", "Berkeley, CA, USA", "37.87189920", "-122.25853990", "edu", "", "United States", "2017"], ["Question Answering under Instructor Guidance with Memory Networks", "", "Tsinghua University", "Tsinghua University", "\u6e05\u534e\u5927\u5b66, 30, \u53cc\u6e05\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100084, \u4e2d\u56fd", "40.00229045", "116.32098908", "edu", "", "China", ""], ["Deep Variation-Structured Reinforcement Learning for Visual Relationship and Attribute Detection", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["VQA-E: Explaining, Elaborating, and Enhancing Your Answers for Visual Questions", "", "University of Science and Technology of China", "University of Science and Technology of China", "\u4e2d\u56fd\u79d1\u5b66\u6280\u672f\u5927\u5b66 \u4e1c\u6821\u533a, 96\u53f7, \u91d1\u5be8\u8def, \u6c5f\u6dee\u5316\u80a5\u5382\u5c0f\u533a, \u829c\u6e56\u8def\u8857\u9053, \u5408\u80a5\u5e02\u533a, \u5408\u80a5\u5e02, \u5b89\u5fbd\u7701, 230026, \u4e2d\u56fd", "31.83907195", "117.26420748", "edu", "", "China", "2018"], ["Word-to-region attention network for visual question answering", "Center for Future Media and School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China", "University of Electronic Science and Technology of China", "University of Electronic Science and Technology of China", "Columbus, OH 43210, USA", "40.01419050", "-83.03091430", "edu", "", "United States", "2018"], ["Semi-supervised vision-language mapping via variational learning", "School of Computing Sciences, University of East Anglia", "University of East Anglia", "University of East Anglia", "Arts (Lower Walkway Level), The Square, Westfield View, Earlham, Norwich, Norfolk, East of England, England, NR4 7TJ, UK", "52.62215710", "1.24091360", "edu", "", "United Kingdom", "2017"], ["Answerer in Questioner's Mind for Goal-Oriented Visual Dialogue", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2017"], ["Conversational Image Editing: Incremental Intent Identification in a New Dialogue Task", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Visual Translation Embedding Network for Visual Relation Detection", "", "Columbia University", "Columbia University", "Columbia University Medical Center, 630, West 168th Street, Washington Heights, Manhattan, Manhattan Community Board 12, New York County, NYC, New York, 10031, USA", "40.84198360", "-73.94368971", "edu", "", "United States", "2017"], ["Being Negative but Constructively: Lessons Learnt from Creating Better Visual Question Answering Datasets", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Image as Data: Automated Visual Content Analysis for Political Science", "", "University of California, Los Angeles", "University of California, Los Angeles", "200 UCLA, Medical Plaza Driveway Suite 540, Los Angeles, CA 90095, USA", "34.06877880", "-118.44500940", "edu", "", "United States", "2018"], ["Input Image : Smile Intensity Generated Responses : Input Question : Input", "", "Microsoft", "Microsoft Corporation, Redmond, WA, USA", "One Microsoft Way, Redmond, WA 98052, USA", "47.64233180", "-122.13693020", "company", "", "United States", "2018"], ["Convolutional Neural Networks for Aerial Multi-Label Pedestrian Detection", "", "West Virginia University", "West Virginia University", "88, Windsor Avenue, The Flatts, Morgantown, Monongalia County, West Virginia, 26505, USA", "39.65404635", "-79.96475355", "edu", "", "United States", "2018"], ["Question Type Guided Attention in Visual Question Answering", "", "California Institute of Technology", "California Institute of Technology", "California Institute of Technology, San Pasqual Walk, Madison Heights, Pasadena, Los Angeles County, California, 91126, USA", "34.13710185", "-118.12527487", "edu", "", "United States", "2018"], ["Are You Talking to Me? Reasoned Visual Dialog Generation through Adversarial Learning", "", "Northwestern Polytechnical University", "Northwestern Polytechnical University", "\u897f\u5317\u5de5\u4e1a\u5927\u5b66 \u53cb\u8c0a\u6821\u533a, 127\u53f7, \u53cb\u8c0a\u897f\u8def, \u957f\u5b89\u8def, \u7891\u6797\u533a (Beilin), \u897f\u5b89\u5e02, \u9655\u897f\u7701, 710072, \u4e2d\u56fd", "34.24691520", "108.91061982", "edu", "", "China", "2017"], ["Dual Attention Network for Visual Question Answering", "", "Boston University", "Boston University", "BU, Bay State Road, Fenway, Boston, Suffolk County, Massachusetts, 02215, USA", "42.35042530", "-71.10056114", "edu", "", "United States", "2017"], ["The Color of the Cat is Gray: 1 Million Full-Sentences Visual Question Answering (FSVQA).", "", "University of Tokyo", "University of Tokyo", "\u6771\u4eac\u5927\u5b66 \u67cf\u30ad\u30e3\u30f3\u30d1\u30b9, \u5b66\u878d\u5408\u306e\u9053, \u67cf\u5e02, \u5343\u8449\u770c, \u95a2\u6771\u5730\u65b9, 277-8583, \u65e5\u672c", "35.90204480", "139.93622009", "edu", "", "Japan", "2016"], ["From VQA to Multimodal CQA: Adapting Visual QA Models for Community QA Tasks", "", "Waseda University", "Waseda University", "\u65e9\u7a32\u7530\u5927\u5b66 \u5317\u4e5d\u5dde\u30ad\u30e3\u30f3\u30d1\u30b9, 2-2, \u6709\u6bdb\u5f15\u91ce\u7dda, \u516b\u5e61\u897f\u533a, \u5317\u4e5d\u5dde\u5e02, \u798f\u5ca1\u770c, \u4e5d\u5dde\u5730\u65b9, 808-0135, \u65e5\u672c", "33.88987280", "130.70856205", "edu", "", "Japan", "2018"], ["Learning to Describe Differences Between Pairs of Similar Images", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2018"], ["Visual Question Reasoning on General Dependency Tree", "", "Sun Yat-Sen University", "Sun Yat-Sen University", "\u4e2d\u5927, \u65b0\u6e2f\u897f\u8def, \u9f99\u8239\u6ed8, \u5eb7\u4e50, \u6d77\u73e0\u533a (Haizhu), \u5e7f\u5dde\u5e02, \u5e7f\u4e1c\u7701, 510105, \u4e2d\u56fd", "23.09461185", "113.28788994", "edu", "", "China", "2018"], ["Where to Look: Focus Regions for Visual Question Answering", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2016"], ["A Knowledge-Grounded Multimodal Search-Based Conversational Agent", "", "Heriot-Watt University", "Heriot-Watt University", "Heriot-Watt University - Edinburgh Campus, Third Gait, Currie, Gogarbank, City of Edinburgh, Scotland, EH14 4AS, UK", "55.91029135", "-3.32345777", "edu", "", "United Kingdom", "2018"], ["Learning Language-Visual Embedding for Movie Understanding with Natural-Language", "", "Max Planck Institute for Informatics", "Max Planck Institute for Informatics", "MPII, E1 4, Campus, Universit\u00e4t, Sankt Johann, Bezirk Mitte, Saarbr\u00fccken, Regionalverband Saarbr\u00fccken, Saarland, 66123, Deutschland", "49.25795660", "7.04577417", "edu", "", "Germany", "2016"], ["Learning Interpretable Spatial Operations in a Rich 3D Blocks World", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2018"], ["Interpreting Visual Question Answering Models", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Whodunnit? Crime Drama as a Case for Natural Language Understanding", "", "University of Edinburgh", "University of Edinburgh", "New College, New College Courtyard, The Mound, Old Town, Edinburgh, City of Edinburgh, Scotland, EH1 2LX, UK", "55.94951105", "-3.19534913", "edu", "", "United Kingdom", "2018"], ["Video Question Answering via Attribute-Augmented Attention Network Learning", "", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2017"], ["ABC-CNN: An Attention Based Convolutional Neural Network for Visual Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2015"], ["How2: A Large-scale Dataset for Multimodal Language Understanding", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2018"], ["Can Saliency Information Benefit Image Captioning Models?", "", "Aalto University", "Aalto University", "Aalto, 24, Otakaari, Otaniemi, Suur-Tapiola, Espoo, Helsingin seutukunta, Uusimaa, Etel\u00e4-Suomi, Manner-Suomi, 02150, Suomi", "60.18558755", "24.82427330", "edu", "", "Finland", "2017"], ["Unsupervised Selection of Negative Examples for Grounded Language Learning", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2018"], ["Data Augmentation for Visual Question Answering", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2017"], ["Inferring and Executing Programs for Visual Reasoning Supplementary Material", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["CrowdVerge: Predicting If People Will Agree on the Answer to a Visual Question", "The University of Texas at Austin, Austin, TX, USA", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2017"], ["Bidirectional Beam Search: Forward-Backward Inference in Neural Sequence Models for Fill-in-the-Blank Image Captioning", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["FlipDial: A Generative Model for Two-Way Visual Dialogue", "", "University of Oxford", "University of Oxford", "Radcliffe Camera, Radcliffe Square, Grandpont, Oxford, Oxon, South East, England, OX1 4AJ, UK", "51.75345380", "-1.25400997", "edu", "", "United Kingdom", "2018"], ["A Dataset and Architecture for Visual Reasoning with a Working Memory", "", "Columbia University", "Columbia University", "Columbia University Medical Center, 630, West 168th Street, Washington Heights, Manhattan, Manhattan Community Board 12, New York County, NYC, New York, 10031, USA", "40.84198360", "-73.94368971", "edu", "", "United States", "2018"], ["Nothing Else Matters: Model-Agnostic Explanations By Identifying Prediction Invariance", "", "University of Washington", "University of Washington", "University of Washington, Rainier Vista, Montlake, University District, Seattle, King County, Washington, 98195, USA", "47.65432380", "-122.30800894", "edu", "", "United States", "2016"], ["Deep Learning for Image-to-Text Generation: A Technical Overview", "Deep Learning Group, Microsoft Research, Redmond, Washington United States", "Microsoft", "Microsoft Corporation, Redmond, WA, USA", "One Microsoft Way, Redmond, WA 98052, USA", "47.64233180", "-122.13693020", "company", "", "United States", "2017"], ["Adding object detection skills to visual dialogue agents", "", "University of Amsterdam", "University of Amsterdam", "Institute for Logic, Language and Computation (ILLC), 107, Science Park, Oost-Watergraafsmeer, Amsterdam, Oost, Amsterdam, Noord-Holland, Nederland, 1098XG, Nederland", "52.35536550", "4.95016440", "edu", "", "Netherlands", "2018"], ["Image Visual Realism: From Human Perception to Machine Computation", "Smart Systems Institute, National University of Singapore, Singapore", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2018"], ["Visual Choice of Plausible Alternatives: An Evaluation of Image-based Commonsense Causal Reasoning", "", "Yonsei University", "Yonsei University", "\uc5f0\uc138\ub300, \uc5f0\uc138\ub85c, \uc2e0\ucd0c\ub3d9, \ucc3d\ucc9c\ub3d9, \uc11c\ub300\ubb38\uad6c, \uc11c\uc6b8\ud2b9\ubcc4\uc2dc, 03789, \ub300\ud55c\ubbfc\uad6d", "37.56004060", "126.93692480", "edu", "", "South Korea", "2018"], ["Learning to Disambiguate by Asking Discriminative Questions", "", "Robotics Institute", "Robotics Institute", "Institute for Field Robotics, \u0e1b\u0e23\u0e30\u0e0a\u0e32\u0e2d\u0e38\u0e17\u0e34\u0e28, \u0e01\u0e23\u0e38\u0e07\u0e40\u0e17\u0e1e\u0e21\u0e2b\u0e32\u0e19\u0e04\u0e23, \u0e40\u0e02\u0e15\u0e23\u0e32\u0e29\u0e0e\u0e23\u0e4c\u0e1a\u0e39\u0e23\u0e13\u0e30, \u0e01\u0e23\u0e38\u0e07\u0e40\u0e17\u0e1e\u0e21\u0e2b\u0e32\u0e19\u0e04\u0e23, 10140, \u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\u0e22", "13.65450525", "100.49423171", "edu", "", "Thailand", "2017"], ["Visual Question Answering With a Hybrid Convolution Recurrent Model", "University of Augsburg, Augsburg, Germany", "University of Augsburg", "Multimedia Computing Lab, University of Augsburg, Germany", "Universit\u00e4tsstra\u00dfe 2, 86159 Augsburg, Germany", "48.33282440", "10.89656160", "edu", "", "Germany", "2018"], ["Exploiting Semantic Contextualization for Interpretation of Human Activity in Videos", "", "University of South Florida", "University of South Florida", "University of South Florida, Leroy Collins Boulevard, Tampa, Hillsborough County, Florida, 33620, USA", "28.05999990", "-82.41383619", "edu", "", "United States", "2017"], ["Adaptively Attending to Visual Attributes and Linguistic Knowledge for Captioning", "University of Queensland, Brisbane, Australia", "University of Queensland", "University of Queensland", "University of Queensland, University Drive, Hill End, St Lucia, Brisbane, QLD, 4072, Australia", "-27.49741805", "153.01316956", "edu", "", "Australia", "2017"], ["ChestX-Ray8: Hospital-Scale Chest X-Ray Database and Benchmarks on Weakly-Supervised Classification and Localization of Common Thorax Diseases", "", "National Institutes of Health", "National Institutes of Health", "NIH, Pooks Hill, Bethesda, Montgomery County, Maryland, USA", "39.00041165", "-77.10327775", "edu", "", "United States", "2017"], ["Commonly Uncommon: Semantic Sparsity in Situation Recognition", "", "University of Virginia", "University of Virginia", "University of Virginia, Rotunda Alley, Carr's Hill, Albemarle County, Virginia, 22904-4119, USA", "38.03536820", "-78.50353220", "edu", "", "United States", "2017"], ["Deep Sparse Coding for Invariant Multimodal Halle Berry Neurons", "", "Los Alamos National Laboratory", "Los Alamos National Laboratory", "New Mexico 87545, USA", "35.84405820", "-106.28716200", "gov", "", "", "2017"], ["Challenging Images For Minds and Machines", "", "York University", "York University", "York University, Keele Campus, Campus Walk, North York, Toronto, Ontario, M3J 2S5, Canada", "43.77439110", "-79.50481085", "edu", "", "Canada", "2018"], ["Tensor Fusion Network for Multimodal Sentiment Analysis", "", "Singapore", "Singapore", "Singapore", "1.35208300", "103.81983600", "edu", "", "Singapore", "2017"], ["EANN: Event Adversarial Neural Networks for Multi-Modal Fake News Detection", "University of Chinese Academy of Sciences, Beijing, China", "University of Chinese Academy of Sciences", "University of Chinese Academy of Sciences", "University of Chinese Academy of Sciences, UCAS, Yuquanlu, \u7389\u6cc9\u8def, \u7530\u6751, \u6d77\u6dc0\u533a, 100049, \u4e2d\u56fd", "39.90828040", "116.24585270", "edu", "", "China", "2018"], ["Annotation Artifacts in Natural Language Inference Data", "", "New York University", "New York University", "NYU, West 4th Street, NoHo Historic District, NoHo, Manhattan, Manhattan Community Board 2, New York County, NYC, New York, 10012, USA", "40.72925325", "-73.99625394", "edu", "", "United States", "2018"], ["SCA-CNN: Spatial and Channel-Wise Attention in Convolutional Networks for Image Captioning", "", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2017"], ["Guest Editorial: Image and Language Understanding", "Google, Seattle, USA", "Google", "Google, Inc.", "1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA", "37.42199990", "-122.08405750", "company", "Google, Mountain View, CA", "United States", "2017"], ["MS-RMAC: Multiscale Regional Maximum Activation of Convolutions for Image Retrieval", "College of Command Information Systems, PLA University of Science and Technology, Nanjing, China", "PLA University of Science and Technology", "PLA University of Science and Technology, China", "China, \u53cc\u9f99\u885760\u53f7", "31.97090700", "118.81289890", "edu", "", "China", "2017"], ["Representation Learning of Knowledge Graphs with Entity Attributes and Multimedia Descriptions", "Institute of Automation Chinese Academy of Sciences, National Lab of Pattern Recognition, Beijing, China", "Chinese Academy of Sciences", "Chinese Academy of Sciences", "\u4e2d\u56fd\u79d1\u5b66\u9662\u5fc3\u7406\u7814\u7a76\u6240, 16, \u6797\u8403\u8def, \u671d\u9633\u533a / Chaoyang, \u5317\u4eac\u5e02, 100101, \u4e2d\u56fd", "40.00447950", "116.37023800", "edu", "", "China", "2018"], ["MSRC: multimodal spatial regression with semantic context for phrase grounding", "Institute for Robotics and Intelligent Systems, University of Southern California, Los Angeles, USA", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2017"], ["Hierarchical Co-Attention for Visual Question Answering", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Visual Spatial Attention Network for Relationship Detection", "University of Electronic Science and Technology of China, Chengdu, China", "University of Electronic Science and Technology of China", "University of Electronic Science and Technology of China", "Columbus, OH 43210, USA", "40.01419050", "-83.03091430", "edu", "", "United States", "2018"], ["Attentive Explanations: Justifying Decisions and Pointing to the Evidence", "", "Max Planck Institute for Informatics", "Max Planck Institute for Informatics", "MPII, E1 4, Campus, Universit\u00e4t, Sankt Johann, Bezirk Mitte, Saarbr\u00fccken, Regionalverband Saarbr\u00fccken, Saarland, 66123, Deutschland", "49.25795660", "7.04577417", "edu", "", "Germany", "2016"], ["Relation Networks for Object Detection", "", "Peking University", "Peking University", "\u5317\u4eac\u5927\u5b66, 5\u53f7, \u9890\u548c\u56ed\u8def, \u7a3b\u9999\u56ed\u5357\u793e\u533a, \u6d77\u6dc0\u533a, \u5317\u4eac\u5e02, 100871, \u4e2d\u56fd", "39.99223790", "116.30393816", "edu", "", "China", "2017"], ["Joint Global and Co-Attentive Representation Learning for Image-Sentence Retrieval", "Chinese Academy of Sciences, Beijing, China", "Chinese Academy of Sciences", "Chinese Academy of Sciences", "\u4e2d\u56fd\u79d1\u5b66\u9662\u5fc3\u7406\u7814\u7a76\u6240, 16, \u6797\u8403\u8def, \u671d\u9633\u533a / Chaoyang, \u5317\u4eac\u5e02, 100101, \u4e2d\u56fd", "40.00447950", "116.37023800", "edu", "", "China", "2018"], ["Speech-Based Visual Question Answering", "", "ETH Zurich", "ETH Zurich", "R\u00e4mistrasse 101, 8092 Z\u00fcrich, Switzerland", "47.37631300", "8.54766990", "edu", "", "Switzerland", "2017"], ["Attention on Attention: Architectures for Visual Question Answering (VQA).", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2018"], ["Visual saliency computation for image analysis", "", "Boston University", "Boston University", "BU, Bay State Road, Fenway, Boston, Suffolk County, Massachusetts, 02215, USA", "42.35042530", "-71.10056114", "edu", "", "United States", "2016"], ["Introduction to Deep Learning Business Applications for Developers", "Link\u00f6ping, Sweden", "Link\u00f6ping, Sweden", "Link\u00f6ping, Sweden", "Link\u00f6ping, Sweden", "58.41080700", "15.62137280", "edu", "", "Sweden", "2018"], ["Knowledge-aware Multimodal Dialogue Systems", "National University of Singapore, Singapore, Singapore", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2018"], ["R-VQA: Learning Visual Relation Facts with Semantic Attention for Visual Question Answering", "", "Tsinghua University", "Tsinghua University", "\u6e05\u534e\u5927\u5b66, 30, \u53cc\u6e05\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100084, \u4e2d\u56fd", "40.00229045", "116.32098908", "edu", "", "China", "2018"], ["Where is Misty? Interpreting Spatial Descriptors by Modeling Regions in Space", "", "University of California, Berkeley", "University of California, Berkeley", "Berkeley Art Museum and Pacific Film Archive, Bancroft Way, Southside, Berkeley, Alameda County, California, 94720-1076, USA", "37.86871260", "-122.25586815", "edu", "", "United States", "2017"], ["Visual Reasoning with Multi-hop Feature Modulation", "", "Rice University", "Rice University", "Rice University, Stockton Drive, Houston, Harris County, Texas, 77005-1890, USA", "29.71679145", "-95.40478113", "edu", "", "United States", "2018"], ["Fast Parameter Adaptation for Few-shot Image Captioning and Visual Question Answering", "Zhejiang University, Zhejiang, China", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2018"], ["Pre-Consulting Dialogue Systems for Telemedicine: Yes/No Intent Classification", "Kyushu Institute of Technology, Tobata, Kitakyushu, Fukuoka, Japan", "Kyushu Institute of Technology, Tobata, Kitakyushu, Fukuoka, Japan", "Kyushu Institute of Technology, Tobata, Kitakyushu, Fukuoka, Japan", "1-1 \u4ed9\u6c34\u753a \u6238\u7551\u533a \u5317\u4e5d\u5dde\u5e02 \u798f\u5ca1\u770c 804-8550, Japan", "33.89419680", "130.83940830", "edu", "", "Japan", "2018"], ["Cross-Modal and Hierarchical Modeling of Video and Text", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Think Visually: Question Answering through Virtual Imagery", "", "University of Michigan", "University of Michigan", "University of Michigan, 500, Hayward Street, Ann Arbor, Washtenaw County, Michigan, 48109, USA", "42.29421420", "-83.71003894", "edu", "", "United States", "2018"], ["Neural Motifs: Scene Graph Parsing with Global Context", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Focal Visual-Text Attention for Visual Question Answering", "", "Google", "Google, Inc.", "1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA", "37.42199990", "-122.08405750", "company", "Google, Mountain View, CA", "United States", "2018"], ["Visual Question: Predicting If a Crowd Will Agree on the Answer", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2016"], ["Natural Language Understanding with Distributed Representation", "", "New York University", "New York University", "NYU, West 4th Street, NoHo Historic District, NoHo, Manhattan, Manhattan Community Board 2, New York County, NYC, New York, 10012, USA", "40.72925325", "-73.99625394", "edu", "", "United States", "2015"], ["Show, Reward and Tell: Automatic Generation of Narrative Paragraph From Photo Stream by Adversarial Training", "", "Microsoft Research Asia", "Microsoft Research Asia", "1 Memorial Dr, Cambridge, MA 02142, USA", "42.36142560", "-71.08120920", "company", "", "United States", "2018"], ["Joint Image Captioning and Question Answering", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2018"], ["Question action relevance and editing for visual question answering", "Dipartimento di Informatica, Universit\u00e0 di Salerno, Fisciano, Italy", "Dipartimento di Informatica, Universit\u00e0 di Salerno, Fisciano, Italy", "Dipartimento di Informatica, Universit\u00e0 di Salerno, Fisciano, Italy", "Universit\u00e0 di Salerno Via Papa, Via Giovanni Paolo II, 132, 84084 Fisciano SA, Italy", "40.77434920", "14.78901500", "edu", "", "Italy", "2018"], ["LearningWord Embeddings for Low-resource Languages by PU Learning", "", "University of California Los Angeles", "University of California Los Angeles", "Los Angeles, CA 90095, USA", "34.06892100", "-118.44518110", "edu", "", "United States", "2018"], ["Considerations for Evaluating Models of Language Understanding and Reasoning", "", "University of Cambridge", "University of Cambridge", "Clifford Allbutt Lecture Theatre, Robinson Way, Romsey, Cambridge, Cambridgeshire, East of England, England, CB2 0QH, UK", "52.17638955", "0.14308882", "edu", "", "United Kingdom", "2015"], ["Automatic Description Generation from Images: A Survey of Models, Datasets, and Evaluation Measures", "", "University of Copenhagen", "University of Copenhagen", "K\u00f8benhavns Universitet, Krystalgade, K\u00f8dbyen, Vesterbro, K\u00f8benhavn, K\u00f8benhavns Kommune, Region Hovedstaden, 1165, Danmark", "55.68015020", "12.57232700", "edu", "", "Denmark", "2017"], ["Share-and-Chat: Achieving Human-Level Video Commenting by Search and Multi-View Embedding", "Sun Yat-Sen University, Guangzhou, China", "Sun Yat-Sen University", "Sun Yat-Sen University", "\u4e2d\u5927, \u65b0\u6e2f\u897f\u8def, \u9f99\u8239\u6ed8, \u5eb7\u4e50, \u6d77\u73e0\u533a (Haizhu), \u5e7f\u5dde\u5e02, \u5e7f\u4e1c\u7701, 510105, \u4e2d\u56fd", "23.09461185", "113.28788994", "edu", "", "China", "2016"], ["Men Also Like Shopping: Reducing Gender Bias Amplification using Corpus-level Constraints", "", "University of Washington", "University of Washington", "University of Washington, Rainier Vista, Montlake, University District, Seattle, King County, Washington, 98195, USA", "47.65432380", "-122.30800894", "edu", "", "United States", "2017"], ["Compact Tensor Pooling for Visual Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2017"], ["On the Flip Side: Identifying Counterexamples in Visual Question Answering", "", "Harvard University", "Harvard University", "Harvard University, Soldiers Field Road, Allston, Boston, Suffolk County, Massachusetts, 02163, USA", "42.36782045", "-71.12666653", "edu", "", "United States", "2018"], ["VISALOGY: Answering Visual Analogy Questions", "", "University of Washington", "University of Washington", "University of Washington, Rainier Vista, Montlake, University District, Seattle, King County, Washington, 98195, USA", "47.65432380", "-122.30800894", "edu", "", "United States", "2015"], ["How emotional are you? Neural Architectures for Emotion Intensity Prediction in Microblogs", "", "Varanasi", "Varanasi", "Varanasi, Uttar Pradesh, India", "25.31764520", "82.97391440", "edu", "", "India", "2018"], ["Multi-Modal Information Extraction in a Question-Answer Framework", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["Prominent Object Detection and Recognition: A Saliency-based Pipeline.", "", "Aalto University", "Aalto University", "Aalto, 24, Otakaari, Otaniemi, Suur-Tapiola, Espoo, Helsingin seutukunta, Uusimaa, Etel\u00e4-Suomi, Manner-Suomi, 02150, Suomi", "60.18558755", "24.82427330", "edu", "", "Finland", "2017"], ["On Available Corpora for Empirical Methods in Vision & Language", "", "Johns Hopkins University", "Johns Hopkins University", "Baltimore, MD 21218, USA", "39.32990130", "-76.62051770", "edu", "", "", "2015"], ["Situation Recognition: Visual Semantic Role Labeling for Image Understanding", "", "University of Washington", "University of Washington", "University of Washington, Rainier Vista, Montlake, University District, Seattle, King County, Washington, 98195, USA", "47.65432380", "-122.30800894", "edu", "", "United States", "2016"], ["Greedy Inference Algorithms for Structured and Neural Models", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2018"], ["Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models", "", "Indiana University", "Indiana University", "Indiana University East, West Cart Road, Richmond, Wayne County, Indiana, 47374, USA", "39.86948105", "-84.87956905", "edu", "", "United States", "2016"], ["Answer-Type Prediction for Visual Question Answering", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2016"], ["Context-Dependent Diffusion Network for Visual Relationship Detection", "", "Southeast University", "Southeast University", "SEU, \u4f53\u80b2\u9986\u8def, \u65b0\u8857\u53e3, \u6708\u5b63\u56ed, \u7384\u6b66\u533a, \u5357\u4eac\u5e02, \u6c5f\u82cf\u7701, 210008, \u4e2d\u56fd", "32.05752790", "118.78682252", "edu", "", "China", "2018"], ["Identity-Aware Textual-Visual Matching with Latent Co-attention", "", "Chinese University of Hong Kong", "Chinese University of Hong Kong", "Hong Kong, \u99ac\u6599\u6c34\u6c60\u65c1\u8def", "22.41626320", "114.21093180", "edu", "", "China", "2017"], ["Referring Image Segmentation via Recurrent Refinement Networks", "", "Tencent", "Tencent", "1234 N Santa Monica Blvd, Beverly Hills, CA 90210, USA", "34.08038290", "-118.39099470", "company", "", "United States", ""], ["Categorizing Concepts with Basic Level for Vision-to-Language", "", "Tongji University", "Tongji University", "\u540c\u6d4e\u5927\u5b66, 1239, \u56db\u5e73\u8def, \u6c5f\u6e7e, \u8679\u53e3\u533a, \u4e0a\u6d77\u5e02, 200092, \u4e2d\u56fd", "31.28473925", "121.49694909", "edu", "", "China", ""], ["Deep Multimodal Reinforcement Network with Contextually Guided Recurrent Attention for Image Question Answering", "College of Computer and Information Engineering, Jiangxi Normal University, Nanchang, China", "Jiangxi Normal University", "Jiangxi Normal University, Nanchang, China", "Nanchang, China", "28.67850000", "116.03121200", "edu", "", "China", "2017"], ["Graph-Structured Representations for Visual Question Answering", "", "University of Adelaide", "University of Adelaide", "University of Adelaide, North Terrace, Adelaide, 5000, City of Adelaide, South Australia, 5000, Australia", "-34.91892260", "138.60423668", "edu", "", "Australia", "2017"], ["Deep Binaries: Encoding Semantic-Rich Cues for Efficient Textual-Visual Cross Retrieval", "", "University of Electronic Science and Technology of China", "University of Electronic Science and Technology of China", "Columbus, OH 43210, USA", "40.01419050", "-83.03091430", "edu", "", "United States", "2017"], ["Uncovering the Temporal Context for Video Question Answering", "SCS, Carnegie Mellon University, Pittsburgh, USA", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Co-Attending Free-Form Regions and Detections With Multi-Modal Multiplicative Feature Embedding for Visual Question Answering", "", "East China Normal University", "East China Normal University", "\u534e\u4e1c\u5e08\u8303\u5927\u5b66, 3663, \u4e2d\u5c71\u5317\u8def, \u66f9\u5bb6\u6e21, \u666e\u9640\u533a, \u666e\u9640\u533a (Putuo), \u4e0a\u6d77\u5e02, 200062, \u4e2d\u56fd", "31.22849230", "121.40211389", "edu", "", "China", "2018"], ["Learning to Reason: End-to-End Module Networks for Visual Question Answering", "", "Boston University", "Boston University", "BU, Bay State Road, Fenway, Boston, Suffolk County, Massachusetts, 02215, USA", "42.35042530", "-71.10056114", "edu", "", "United States", "2017"], ["Learning to Describe E-Commerce Images from Noisy Online Data", "", "Tohoku University", "Tohoku University", "Tohoku University, \u4e94\u6a4b\u901a, \u9752\u8449\u533a, \u4ed9\u53f0\u5e02, \u5bae\u57ce\u770c, \u6771\u5317\u5730\u65b9, 980-0811, \u65e5\u672c", "38.25309450", "140.87365930", "edu", "", "Japan", "2016"], ["Interpretable Basis Decomposition for Visual Explanation", "", "MIT CSAIL", "MIT CSAIL", "32 Vassar St, Cambridge, MA 02139, USA", "42.36194070", "-71.09043780", "edu", "", "United States", "2018"], ["Convolutional Image Captioning", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2017"], ["ViP-CNN: A Visual Phrase Reasoning Convolutional Neural Network for Visual Relationship Detection", "", "Hong Kong", "Hong Kong", "Hong Kong", "22.39642800", "114.10949700", "edu", "", "China", "2017"], ["RecipeQA: A Challenge Dataset for Multimodal Comprehension of Cooking Recipes", "", "Hacettepe University", "Hacettepe University", "Hacettepe \u00dcniversitesi Beytepe Kamp\u00fcs\u00fc, Hacettepe-Beytepe Kamp\u00fcs Yolu, \u00dcniversiteler Mahallesi, Ankara, \u00c7ankaya, Ankara, \u0130\u00e7 Anadolu B\u00f6lgesi, 06800, T\u00fcrkiye", "39.86742125", "32.73519072", "edu", "", "Turkey", "2018"], ["Deep Multimodal Learning: A Survey on Recent Advances and Trends", "University of Guelph, Guelph, Ontario, Canada", "University of Guelph", "University of Guelph, Guelph, Ontario, Canada", "University of Guelph, Guelph, ON N1G 1Y4, Canada", "43.52937320", "-80.22525020", "edu", "", "Canada", "2017"], ["Multimodal Fusion with Recurrent Neural Networks for Rumor Detection on Microblogs", "University of Rochester, Rochester, NY, USA", "University of Rochester", "University of Rochester", "Memorial Art Gallery, 500, University Avenue, East End, Rochester, Monroe County, New York, 14607, USA", "43.15769690", "-77.58829158", "edu", "", "United States", "2017"], ["Pivot Correlational Neural Network for Multimodal Video Categorization", "", "KAIST, Daejeon, South Korea", "KAIST, Daejeon, South Korea", "291 Daehak-ro, Eoeun-dong, Yuseong-gu, Daejeon, South Korea", "36.37214270", "127.36039000", "edu", "", "South Korea", "2018"], ["Building a Large-scale Multimodal Knowledge Base System for Answering Visual Queries", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2015"], ["Movie Fill in the Blank with Adaptive Temporal Attention and Description Update", "University of Electronic Science and Technology of China, Chengdu, China", "University of Electronic Science and Technology of China", "University of Electronic Science and Technology of China", "Columbus, OH 43210, USA", "40.01419050", "-83.03091430", "edu", "", "United States", "2017"], ["Multimodal Named Entity Recognition for Short Social Media Posts", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2018"], ["Visual Question Answering using Explicit Visual Attention", "Aristotle University of Thessaloniki, Dept. of Informatics, Thessaloniki, 54124,Greece", "Aristotle University of Thessaloniki", "Aristotle University of Thessaloniki", "\u0391\u03c1\u03b9\u03c3\u03c4\u03bf\u03c4\u03ad\u03bb\u03b5\u03b9\u03bf \u03a0\u03b1\u03bd\u03b5\u03c0\u03b9\u03c3\u03c4\u03ae\u03bc\u03b9\u03bf \u0398\u03b5\u03c3\u03c3\u03b1\u03bb\u03bf\u03bd\u03af\u03ba\u03b7\u03c2, \u0395\u03b3\u03bd\u03b1\u03c4\u03af\u03b1, \u03a3\u03b1\u03c1\u03ac\u03bd\u03c4\u03b1 \u0395\u03ba\u03ba\u03bb\u03b7\u03c3\u03af\u03b5\u03c2, \u0395\u03c5\u03b1\u03b3\u03b3\u03b5\u03bb\u03af\u03c3\u03c4\u03c1\u03b9\u03b1, \u0398\u03b5\u03c3\u03c3\u03b1\u03bb\u03bf\u03bd\u03af\u03ba\u03b7, \u0394\u03ae\u03bc\u03bf\u03c2 \u0398\u03b5\u03c3\u03c3\u03b1\u03bb\u03bf\u03bd\u03af\u03ba\u03b7\u03c2, \u03a0\u03b5\u03c1\u03b9\u03c6\u03b5\u03c1\u03b5\u03b9\u03b1\u03ba\u03ae \u0395\u03bd\u03cc\u03c4\u03b7\u03c4\u03b1 \u0398\u03b5\u03c3\u03c3\u03b1\u03bb\u03bf\u03bd\u03af\u03ba\u03b7\u03c2, \u03a0\u03b5\u03c1\u03b9\u03c6\u03ad\u03c1\u03b5\u03b9\u03b1 \u039a\u03b5\u03bd\u03c4\u03c1\u03b9\u03ba\u03ae\u03c2 \u039c\u03b1\u03ba\u03b5\u03b4\u03bf\u03bd\u03af\u03b1\u03c2, \u039c\u03b1\u03ba\u03b5\u03b4\u03bf\u03bd\u03af\u03b1 - \u0398\u03c1\u03ac\u03ba\u03b7, 54124, \u0395\u03bb\u03bb\u03ac\u03b4\u03b1", "40.62984145", "22.95889350", "edu", "", "Greece", "2018"], ["End-to-End Instance Segmentation and Counting with Recurrent Attention", "", "University of Toronto", "University of Toronto", "University of Toronto, St. George Street, Bloor Street Culture Corridor, Old Toronto, Toronto, Ontario, M5S 1A5, Canada", "43.66333345", "-79.39769975", "edu", "", "Canada", "2016"], ["Ask Me Anything: Free-Form Visual Question Answering Based on Knowledge from External Sources", "", "University of Adelaide", "University of Adelaide", "University of Adelaide, North Terrace, Adelaide, 5000, City of Adelaide, South Australia, 5000, Australia", "-34.91892260", "138.60423668", "edu", "", "Australia", "2016"], ["Learning Multi-Modal Navigation for Unmanned Ground Vehicles", "", "SUNY Buffalo", "SUNY Buffalo", "SUNY College at Buffalo, Academic Drive, Elmwood Village, Buffalo, Erie County, New York, 14222, USA", "42.93362780", "-78.88394479", "edu", "", "United States", "2018"], ["Video Question Answering via Gradually Refined Attention over Appearance and Motion", "Zhejiang University, Hangzhou, China", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2017"], ["Sherlock: Scalable Fact Learning in Images", "", "Rutgers University", "Rutgers University", "Rutgers Cook Campus - North, Biel Road, New Brunswick, Middlesex County, New Jersey, 08901, USA", "40.47913175", "-74.43168868", "edu", "", "United States", "2017"], ["An Elevator Pitch on Deep Learning", "University of Illinois at Urbana-Champaign (UIUC)", "University of Illinois at Urbana-Champaign (UIUC)", "University of Illinois at Urbana-Champaign (UIUC)", "Champaign, IL, USA", "40.10195230", "-88.22716150", "edu", "", "United States", "2017"], ["Learn to Classify and Count: A Unified Framework for Object Classification and Counting", "University of Electronic Science and Technology of China, West Hi-Tech Zone, Chengdu, SiChuan, China", "University of Electronic Science and Technology of China", "University of Electronic Science and Technology of China", "Columbus, OH 43210, USA", "40.01419050", "-83.03091430", "edu", "", "United States", "2018"], ["SIGGRAPH Asia 2016: course notes directions in shape analysis towards functionality", "Shenzhen University", "Shenzhen University", "Shenzhen University", "\u6df1\u5733\u5927\u5b66, 3688, \u5357\u6d77\u5927\u9053, \u86c7\u53e3, \u540c\u4e50\u6751, \u5357\u5c71\u533a, \u6df1\u5733\u5e02, \u5e7f\u4e1c\u7701, 518060, \u4e2d\u56fd", "22.53521465", "113.93159110", "edu", "", "China", "2016"], ["Natural Language Video Description using Deep Recurrent Neural Networks", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2015"], ["Phrase Localization and Visual Relationship Detection with Comprehensive Image-Language Cues", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2017"], ["A Focused Dynamic Attention Model for Visual Question Answering", "", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2016"], ["UOUS AND DISCRETE ADDRESSING SCHEMES", "", "New York University", "New York University", "NYU, West 4th Street, NoHo Historic District, NoHo, Manhattan, Manhattan Community Board 2, New York County, NYC, New York, 10012, USA", "40.72925325", "-73.99625394", "edu", "", "United States", "2016"], ["Tackling the Story Ending Biases in The Story Cloze Test", "", "University of Rochester", "University of Rochester", "Memorial Art Gallery, 500, University Avenue, East End, Rochester, Monroe County, New York, 14607, USA", "43.15769690", "-77.58829158", "edu", "", "United States", "2018"], ["Coreset-Based Neural Network Compression", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2018"], ["The Amazing Mysteries of the Gutter: Drawing Inferences Between Panels in Comic Book Narratives", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2017"], ["Computer Vision \u2013 ECCV 2018", "Hebrew University of Jerusalem, Jerusalem, Israel", "Hebrew University of Jerusalem", "The Hebrew University of Jerusalem", "\u05d4\u05d0\u05d5\u05e0\u05d9\u05d1\u05e8\u05e1\u05d9\u05d8\u05d4 \u05d4\u05e2\u05d1\u05e8\u05d9\u05ea \u05d1\u05d9\u05e8\u05d5\u05e9\u05dc\u05d9\u05dd, Reagan Plaza, \u05e7\u05e8\u05d9\u05ea \u05de\u05e0\u05d7\u05dd \u05d1\u05d2\u05d9\u05df, \u05d4\u05e8 \u05d4\u05e6\u05d5\u05e4\u05d9\u05dd, \u05d9\u05e8\u05d5\u05e9\u05dc\u05d9\u05dd, \u05de\u05d7\u05d5\u05d6 \u05d9\u05e8\u05d5\u05e9\u05dc\u05d9\u05dd, NO, \u05d9\u05e9\u05e8\u05d0\u05dc", "31.79185550", "35.24472300", "edu", "", "Israel", "2018"], ["Multimodal Learning and Reasoning for Visual Question Answering", "", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2017"], ["Domain Adapted Word Embeddings for Improved Sentiment Classification", "", "University of Wisconsin Madison", "University of Wisconsin Madison", "University of Wisconsin-Madison, Marsh Lane, Madison, Dane County, Wisconsin, 53705-2221, USA", "43.07982815", "-89.43066425", "edu", "", "United States", "2018"], ["Uncovering Temporal Context for Video Question and Answering", "", "University of Technology Sydney", "University of Technology Sydney", "University of Technology Sydney, Omnibus Lane, Ultimo, Sydney, NSW, 2007, Australia", "-33.88096510", "151.20107299", "edu", "", "Australia", "2015"], ["We are Humor Beings: Understanding and Predicting Visual Humor", "", "Microsoft", "Microsoft Corporation, Redmond, WA, USA", "One Microsoft Way, Redmond, WA 98052, USA", "47.64233180", "-122.13693020", "company", "", "United States", "2016"], ["COCO Attributes: Attributes for People, Animals, and Objects", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2016"], ["Learning Articulated Object Models from Language and Vision", "", "University of Rochester", "University of Rochester", "Memorial Art Gallery, 500, University Avenue, East End, Rochester, Monroe County, New York, 14607, USA", "43.15769690", "-77.58829158", "edu", "", "United States", "2017"], ["Resolving vision and language ambiguities together: Joint segmentation & prepositional attachment resolution in captioned scenes", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Query-Guided Regression Network with Context Policy for Phrase Grounding", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2017"], ["Object-Difference Attention: A Simple Relational Attention for Visual Question Answering", "Beijing University of Posts and Telecommunications, Beijing, China", "Beijing University of Posts and Telecommunications", "Beijing University of Posts and Telecommunications", "\u5317\u4eac\u90ae\u7535\u5927\u5b66, \u897f\u571f\u57ce\u8def, \u6d77\u6dc0\u533a, \u5317\u4eac\u5e02, 100082, \u4e2d\u56fd", "39.96014880", "116.35193921", "edu", "", "China", "2018"], ["TGIF-QA: Toward Spatio-Temporal Reasoning in Visual Question Answering", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2017"], ["Graph R-CNN for Scene Graph Generation", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2018"], ["Attacking Visual Language Grounding with Adversarial Examples: A Case Study on Neural Image Captioning", "", "IBM Research, North Carolina", "IBM Research", "IBM, East Cornwallis Road, Research Triangle Park, Nelson, Durham County, North Carolina, 27709, USA", "35.90422720", "-78.85565763", "company", "", "United States", "2018"], ["Vision and Language Integration: Moving beyond Objects", "", "University of Trento", "University of Trento", "University of Trento, Via Giuseppe Verdi, Piedicastello, Trento, Territorio Val d'Adige, TN, TAA, 38122, Italia", "46.06588360", "11.11598940", "edu", "", "Italy", "2017"], ["Supervised and Unsupervised Transfer Learning for Question Answering", "", "National Taiwan University", "National Taiwan University", "\u81fa\u5927;\u53f0\u5927, 1, \u7f85\u65af\u798f\u8def\u56db\u6bb5, \u5b78\u5e9c\u91cc, \u5927\u5b89\u5340, \u81fa\u5317\u5e02, 10617, \u81fa\u7063", "25.01682835", "121.53846924", "edu", "", "Taiwan", "2018"], ["Describing Common Human Visual Actions in Images", "", "California Institute of Technology", "California Institute of Technology", "California Institute of Technology, San Pasqual Walk, Madison Heights, Pasadena, Los Angeles County, California, 91126, USA", "34.13710185", "-118.12527487", "edu", "", "United States", "2015"], ["Survey of Recent Advances in Visual Question Answering", "", "Adobe Systems", "Adobe Systems", "343 Preston St, Ottawa, ON K1S 1N4, Canada", "45.40242130", "-75.70955410", "edu", "", "South Korea", "2017"], ["Look Before You Leap: Bridging Model-Free and Model-Based Reinforcement Learning for Planned-Ahead Vision-and-Language Navigation", "", "University of California, Santa Barbara", "University of California, Santa Barbara", "UCSB, Santa Barbara County, California, 93106, USA", "34.41459370", "-119.84581950", "edu", "", "United States", "2018"], ["Deep Learning in Microscopy Image Analysis: A Survey", "J. Crayton Pruitt Family Department of Biomedical Engineering, University of Florida, Gainesville, FL, USA", "University of Florida", "University of Florida", "University of Florida, Southwest 16th Avenue, Diamond Village Apartments, City of Gainesville Municipal Boundaries, Alachua County, Florida, 32611, USA", "29.63287840", "-82.34901330", "edu", "", "United States", "2018"], ["A Taxonomy of Deep Convolutional Neural Nets for Computer Vision", "", "University of Surrey", "University of Surrey", "University of Surrey, Spine Road, Guildford Park, Guildford, Surrey, South East, England, GU2 7XH, UK", "51.24303255", "-0.59001382", "edu", "", "United Kingdom", "2016"], ["Aligned Image-Word Representations Improve Inductive Transfer Across Vision-Language Tasks", "", "Google", "Google, Inc.", "1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA", "37.42199990", "-122.08405750", "company", "Google, Mountain View, CA", "United States", "2017"], ["Compositional Obverter Communication Learning From Raw Visual Input", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2018"], ["VSE++: Improving Visual-Semantic Embeddings with Hard Negatives", "", "University of Toronto", "University of Toronto", "University of Toronto, St. George Street, Bloor Street Culture Corridor, Old Toronto, Toronto, Ontario, M5S 1A5, Canada", "43.66333345", "-79.39769975", "edu", "", "Canada", "2018"], ["Benchmark Visual Question Answer Models by using Focus Map", "", "Shanghai Jiaotong University", "Shanghai Jiaotong University", "China, Shanghai, Minhang, \u4e1c\u5ddd\u8def \u90ae\u653f\u7f16\u7801: 200240", "31.02522010", "121.43377840", "edu", "", "China", "2018"], ["DeepFuse: A Deep Unsupervised Approach for Exposure Fusion with Extreme Exposure Image Pairs", "", "Indian Institute of Science Bangalore", "Indian Institute of Science Bangalore", "IISc, Gulmohar Marg, RMV Stage II - 1st Block, Aramane Nagara Ward, West Zone, Bengaluru, Bangalore Urban, Karnataka, 560012, India", "13.02223470", "77.56718325", "edu", "", "India", "2017"], ["Structured Attentions for Visual Question Answering", "", "ShanghaiTech University", "ShanghaiTech University", "Yueyang Rd, Xuhui Qu, Shanghai Shi, China", "31.20254500", "121.45308600", "edu", "", "", "2017"], ["Visual Curiosity: Learning to Ask Questions to Learn Visual Recognition", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2018"], ["Generation and Comprehension of Unambiguous Object Descriptions", "", "University of Oxford", "University of Oxford", "Radcliffe Camera, Radcliffe Square, Grandpont, Oxford, Oxon, South East, England, OX1 4AJ, UK", "51.75345380", "-1.25400997", "edu", "", "United Kingdom", "2016"], ["Comparatives, Quantifiers, Proportions: A Multi-Task Model for the Learning of Quantities from Vision", "", "University of Trento", "University of Trento", "University of Trento, Via Giuseppe Verdi, Piedicastello, Trento, Territorio Val d'Adige, TN, TAA, 38122, Italia", "46.06588360", "11.11598940", "edu", "", "Italy", "2018"], ["Paying Attention to Descriptions Generated by Image Captioning Models", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2017"], ["Object Relation Detection Based on One-shot Learning", "", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2018"], ["Explore Multi-Step Reasoning in Video Question Answering", "Tianjin University, Tianjin, China", "Tianjin University", "Tianjin University", "\u6cf0\u5c71\u822a\u7a7a\u6e2f/\u5929\u6d25\u5927\u53a6, \u67a3\u884c\u8def, \u67a3\u884c \u9ad8\u738b\u5bfa, \u957f\u57ce\u8def, \u5927\u6cb3, \u5cb1\u5cb3\u533a (Daiyue), \u6cf0\u5b89\u5e02, \u5c71\u4e1c\u7701, 271000, \u4e2d\u56fd", "36.20304395", "117.05842113", "edu", "", "China", "2018"], ["Reward Learning from Narrated Demonstrations", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2018"], ["Learning Deep Structure-Preserving Image-Text Embeddings", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2016"], ["Question Part Relevance and Editing for Cooperative and Context-Aware VQA (C2VQA)", "Universit\u00e0 di Salerno, Dipartimento di Informatica, Fisciano, Italy", "Universit\u00e0 di Salerno, Dipartimento di Informatica, Fisciano, Italy", "Universit\u00e0 di Salerno, Dipartimento di Informatica, Fisciano, Italy", "Universit\u00e0 di Salerno Via Papa, Via Giovanni Paolo II, 132, 84084 Fisciano SA, Italy", "40.77434920", "14.78901500", "edu", "", "Italy", "2017"], ["Emotional Dialogue Generation using Image-Grounded Language Models", "Microsoft Research, Redmond, WA, USA", "Microsoft", "Microsoft Corporation, Redmond, WA, USA", "One Microsoft Way, Redmond, WA 98052, USA", "47.64233180", "-122.13693020", "company", "", "United States", "2018"], ["Person Re-Identification with Vision and Language", "", "University of Surrey", "University of Surrey", "University of Surrey, Spine Road, Guildford Park, Guildford, Surrey, South East, England, GU2 7XH, UK", "51.24303255", "-0.59001382", "edu", "", "United Kingdom", "2018"], ["Object Referring in Videos with Language and Human Gaze", "", "ETH Zurich", "ETH Zurich", "R\u00e4mistrasse 101, 8092 Z\u00fcrich, Switzerland", "47.37631300", "8.54766990", "edu", "", "Switzerland", "2018"], ["Modeling Image Virality with Pairwise Spatial Transformer Networks", "", "MIT", "Massachusetts Institute", "MIT, Amherst Street, Cambridgeport, Cambridge, Middlesex County, Massachusetts, 02238, USA", "42.35839610", "-71.09567788", "edu", "", "United States", "2017"], ["Towards Literate Artificial Intelligence", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2018"], ["Recurrent Models for Situation Recognition", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2017"], ["Video Fill In the Blank Using LR/RL LSTMs with Spatial-Temporal Attentions", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2017"], ["A simple neural network module for relational reasoning", "", "London, United Kingdom", "London, United Kingdom", "London, Greater London, England, SW1A 2DU, UK", "51.50732190", "-0.12764740", "edu", "", "United Kingdom", "2017"], ["Neural Perspective to Jigsaw Puzzle Solving", "", "IIT Kanpur", "IIT Kanpur", "Kalyanpur, Kanpur, Uttar Pradesh 208016, India", "26.51233880", "80.23290000", "edu", "", "India", "2016"], ["Multimodal Hierarchical Reinforcement Learning Policy for Task-Oriented Visual Dialog", "", "University of California, Davis", "University of California, Davis", "University of California, Davis, Apiary Drive, Yolo County, California, 95616-5270, USA", "38.53363490", "-121.79077264", "edu", "", "United States", "2018"], ["Cross-Dataset Adaptation for Visual Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["AnchorNet: A Weakly Supervised Network to Learn Geometry-Sensitive Features for Semantic Matching", "", "University of Oxford", "University of Oxford", "Radcliffe Camera, Radcliffe Square, Grandpont, Oxford, Oxon, South East, England, OX1 4AJ, UK", "51.75345380", "-1.25400997", "edu", "", "United Kingdom", "2017"], ["Visual Question Answering with Memory-Augmented Networks", "", "University of Adelaide", "University of Adelaide", "University of Adelaide, North Terrace, Adelaide, 5000, City of Adelaide, South Australia, 5000, Australia", "-34.91892260", "138.60423668", "edu", "", "Australia", "2017"], ["Adversarial Attacks Beyond the Image Space", "", "Hong Kong University of Science and Technology", "Hong Kong University of Science and Technology", "\u9999\u6e2f\u79d1\u6280\u5927\u5b78 Hong Kong University of Science and Technology, \u5927\u5b78\u9053 University Road, \u5927\u57d4\u4ed4 Tai Po Tsai, \u5927\u57d4\u4ed4\u6751 Tai Po Tsai Village, \u65b0\u754c New Territories, HK, DD253 1209, \u4e2d\u56fd", "22.33863040", "114.26203370", "edu", "", "China", "2017"], ["Learning Models for Actions and Person-Object Interactions with Transfer to Question Answering", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2016"], ["The Social Picture", "University of Catania, Catania, Italy", "University of Catania", "Electronics and Computer Engineering, University of Catania, Catania, Italy", "Via S. Sofia, 64, 95125 Catania CT, Italy", "37.52442420", "15.06991700", "edu", "", "Italy", "2016"], ["A Dataset and Exploration of Models for Understanding Video Data through Fill-in-the-Blank Question-Answering", "", "Polytechnique Montreal", "Polytechnique Montr\u00b4eal", "2900 Boulevard Edouard-Montpetit, Montr\u00e9al, QC H3T 1J4, Canada", "45.50438400", "-73.61288290", "edu", "Polytechnique Montreal, Montreal, Quebec, Canada", "Canada", "2017"], ["What is in that picture ? Visual Question Answering System", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["Joint Parsing of Cross-view Scenes with Spatio-temporal Semantic Parse Graphs", "", "University of California", "University of California", "Berkeley, CA, USA", "37.87189920", "-122.25853990", "edu", "", "United States", "2017"], ["Tensorize , Factorize and Regularize : Robust Visual Relationship Learning", "", "University of Wisconsin Madison", "University of Wisconsin Madison", "University of Wisconsin-Madison, Marsh Lane, Madison, Dane County, Wisconsin, 53705-2221, USA", "43.07982815", "-89.43066425", "edu", "", "United States", ""], ["Reciprocal Attention Fusion for Visual Question Answering", "", "Australian National University", "Australian National University", "Canberra ACT 0200, Australia", "-35.27769990", "149.11852700", "edu", "", "Australia", "2018"], ["Learning Knowledge Bases for Multimedia in 2015", "Facebook, Mountain View, USA", "Facebook", "Facebook", "250 Bryant St, Mountain View, CA 94041, USA", "37.39367170", "-122.08072620", "company", "Facebook, Mountain View, CA", "United States", "2015"], ["Spatial Knowledge Distillation to aid Visual Reasoning", "", "Arizona State University", "Arizona State University", "Arizona State University Polytechnic campus, East Texas Avenue, Mesa, Maricopa County, Arizona, 85212, USA", "33.30715065", "-111.67653157", "edu", "", "United States", "2018"], ["Data Hallucination , Falsification and Validation using Generative Models and Formal Methods by", "", "University of California, Berkeley", "University of California, Berkeley", "Berkeley Art Museum and Pacific Film Archive, Bancroft Way, Southside, Berkeley, Alameda County, California, 94720-1076, USA", "37.86871260", "-122.25586815", "edu", "", "United States", "2018"], ["Overview of ImageCLEF 2018: Challenges, Datasets and Evaluation", "", "University of Oslo", "University of Oslo", "UiO, Moltke Moes vei, Blindern, Nordre Aker, Oslo, 0851, Norge", "59.93891665", "10.72170765", "edu", "", "Norway", "2018"], ["Large Graph Exploration via Subgraph Discovery and Decomposition", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2018"], ["Visual Reasoning with Natural Language", "", "Cornell University", "Cornell University", "Cornell University, Forest Home Drive, Forest Home, Tompkins County, New York, 14853, USA", "42.45055070", "-76.47835130", "edu", "", "United States", "2017"], ["\"Seeing is believing: the quest for multimodal knowledge\" by Gerard de Melo and Niket Tandon, with Martin Vesely as coordinator", "", "Max Planck Institute for Informatics", "Max Planck Institute for Informatics", "MPII, E1 4, Campus, Universit\u00e4t, Sankt Johann, Bezirk Mitte, Saarbr\u00fccken, Regionalverband Saarbr\u00fccken, Saarland, 66123, Deutschland", "49.25795660", "7.04577417", "edu", "", "Germany", "2016"], ["Neural Arithmetic Logic Units", "", "University of Oxford", "University of Oxford", "Radcliffe Camera, Radcliffe Square, Grandpont, Oxford, Oxon, South East, England, OX1 4AJ, UK", "51.75345380", "-1.25400997", "edu", "", "United Kingdom", "2018"], ["Visual Question Answering Using Various Methods", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2016"], ["CoQA: A Conversational Question Answering Challenge", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2018"], ["Video Question Answering via Hierarchical Spatio-Temporal Attention Networks", "", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2017"], ["An Analysis of Action Recognition Datasets for Language and Vision Tasks", "", "University of Edinburgh", "University of Edinburgh", "New College, New College Courtyard, The Mound, Old Town, Edinburgh, City of Edinburgh, Scotland, EH1 2LX, UK", "55.94951105", "-3.19534913", "edu", "", "United Kingdom", "2017"], ["Living a discrete life in a continuous world: Reference with distributed representations", "", "University of Trento", "University of Trento", "University of Trento, Via Giuseppe Verdi, Piedicastello, Trento, Territorio Val d'Adige, TN, TAA, 38122, Italia", "46.06588360", "11.11598940", "edu", "", "Italy", "2016"], ["Towards Transparent AI Systems: Interpreting Visual Question Answering Models", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Multimodal Explanations: Justifying Decisions and Pointing to the Evidence", "", "University of California, Berkeley", "University of California, Berkeley", "Berkeley Art Museum and Pacific Film Archive, Bancroft Way, Southside, Berkeley, Alameda County, California, 94720-1076, USA", "37.86871260", "-122.25586815", "edu", "", "United States", "2018"], ["Chess Q & A : Question Answering on Chess Games", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2015"], ["Active Learning for Visual Question Answering: An Empirical Study", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["Overcoming Language Priors in Visual Question Answering with Adversarial Regularization", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2018"], ["What value high level concepts in vision to language problems ?", "", "University of Adelaide", "University of Adelaide", "University of Adelaide, North Terrace, Adelaide, 5000, City of Adelaide, South Australia, 5000, Australia", "-34.91892260", "138.60423668", "edu", "", "Australia", "2015"], ["Embedding Network for Visual Relation Detection", "", "Columbia University", "Columbia University", "Columbia University Medical Center, 630, West 168th Street, Washington Heights, Manhattan, Manhattan Community Board 12, New York County, NYC, New York, 10031, USA", "40.84198360", "-73.94368971", "edu", "", "United States", "2017"], ["Question-Guided Hybrid Convolution for Visual Question Answering", "", "Chinese University of Hong Kong", "Chinese University of Hong Kong", "Hong Kong, \u99ac\u6599\u6c34\u6c60\u65c1\u8def", "22.41626320", "114.21093180", "edu", "", "China", "2018"], ["Exploring Models and Data for Image Question Answering", "", "University of Toronto", "University of Toronto", "University of Toronto, St. George Street, Bloor Street Culture Corridor, Old Toronto, Toronto, Ontario, M5S 1A5, Canada", "43.66333345", "-79.39769975", "edu", "", "Canada", "2015"], ["Knowledge Acquisition for Visual Question Answering via Iterative Querying", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2017"], ["Deep Understanding of Cooking Procedure for Cross-modal Recipe Retrieval", "National University of Singapore, Singapore, Singapore", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2018"], ["Scene Graph Parsing as Dependency Parsing", "", "Johns Hopkins University", "Johns Hopkins University", "Baltimore, MD 21218, USA", "39.32990130", "-76.62051770", "edu", "", "", "2018"], ["Learning Cooperative Visual Dialog Agents with Deep Reinforcement Learning", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["ShapeWorld - A new test methodology for multimodal language understanding", "", "University of Cambridge", "University of Cambridge", "Clifford Allbutt Lecture Theatre, Robinson Way, Romsey, Cambridge, Cambridgeshire, East of England, England, CB2 0QH, UK", "52.17638955", "0.14308882", "edu", "", "United Kingdom", "2017"], ["Mean Box Pooling: A Rich Image Representation and Output Embedding for the Visual Madlibs Task", "", "Max Planck Institute for Informatics", "Max Planck Institute for Informatics", "MPII, E1 4, Campus, Universit\u00e4t, Sankt Johann, Bezirk Mitte, Saarbr\u00fccken, Regionalverband Saarbr\u00fccken, Saarland, 66123, Deutschland", "49.25795660", "7.04577417", "edu", "", "Germany", "2016"], ["TVT: Two-View Transformer Network for Video Captioning", "", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2018"], ["Multimodal Deep Learning using Images and Text for Information Graphic Classification", "Villanova University, Villanova, PA, USA", "Villanova University", "Villanova University", "Villanova University, East Lancaster Avenue, Radnor Township, Delaware County, Pennsylvania, 19010, USA", "40.03677740", "-75.34202332", "edu", "", "United States", "2018"], ["Visual Textbook Network: Watch Carefully before Answering Visual Questions", "", "Chinese Academy of Sciences", "Chinese Academy of Sciences", "\u4e2d\u56fd\u79d1\u5b66\u9662\u5fc3\u7406\u7814\u7a76\u6240, 16, \u6797\u8403\u8def, \u671d\u9633\u533a / Chaoyang, \u5317\u4eac\u5e02, 100101, \u4e2d\u56fd", "40.00447950", "116.37023800", "edu", "", "China", "2017"], ["Learning by Asking Questions", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["MovieQA: Understanding Stories in Movies through Question-Answering", "", "Karlsruhe Institute of Technology", "Karlsruhe Institute of Technology", "KIT, Leopoldshafener Allee, Linkenheim, Linkenheim-Hochstetten, Landkreis Karlsruhe, Regierungsbezirk Karlsruhe, Baden-W\u00fcrttemberg, 76351, Deutschland", "49.10184375", "8.43312560", "edu", "", "Germany", "2016"], ["A Better Way to Attend: Attention With Trees for Video Question Answering", "State Key Laboratory of CAD&CG, Zhejiang University, Hangzhou, China", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2018"], ["A Joint Speaker-Listener-Reinforcer Model for Referring Expressions", "", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill, East Cameron Avenue, Chapel Hill, Orange County, North Carolina, 27514, USA", "35.91139710", "-79.05045290", "edu", "", "United States", "2017"], ["More Than An Answer: Neural Pivot Network for Visual Qestion Answering", "Xiamen University, Xiamen, China", "Xiamen University", "Xiamen University", "\u53a6\u95e8\u5927\u5b66, \u601d\u660e\u5357\u8def Siming South Road, \u601d\u660e\u533a, \u601d\u660e\u533a (Siming), \u53a6\u95e8\u5e02 / Xiamen, \u798f\u5efa\u7701, 361005, \u4e2d\u56fd", "24.43994190", "118.09301781", "edu", "", "China", "2017"], ["What's in a Question: Using Visual Questions as a Form of Supervision", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["A Neural Multi-sequence Alignment TeCHnique (NeuMATCH)", "", "Disney Research", "Disney Research", "521 Circle 7 Dr, Glendale, CA 91201, USA", "34.15797420", "-118.28947290", "company", "", "United States", "2018"], ["Modularity Matters: Learning Invariant Relational Reasoning Tasks", "", "Aalto University", "Aalto University", "Aalto, 24, Otakaari, Otaniemi, Suur-Tapiola, Espoo, Helsingin seutukunta, Uusimaa, Etel\u00e4-Suomi, Manner-Suomi, 02150, Suomi", "60.18558755", "24.82427330", "edu", "", "Finland", "2018"], ["MTLE: A Multitask Learning Encoder of Visual Feature Representations for Video and Movie Description", "", "Ohio State University", "The Ohio State University", "The Ohio State University, Woody Hayes Drive, Columbus, Franklin County, Ohio, 43210, USA", "40.00471095", "-83.02859368", "edu", "", "United States", "2018"], ["Temporal Modular Networks for Retrieving Complex Compositional Activities in Videos", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2018"], ["Multimodal Differential Network for Visual Question Generation", "", "Indian Institute of Technology Delhi", "IIIT-Delhi, India", "IIIT-Delhi, Mathura Road, Friends Colony, South East Delhi, Delhi, 110020, India", "28.54632595", "77.27325504", "edu", "", "India", "2018"], ["A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning", "", "KAIST", "KAIST", "291 Daehak-ro, Eoeun-dong, Yuseong-gu, Daejeon, South Korea", "36.37214270", "127.36039000", "edu", "", "South Korea", "2017"], ["Generative Attention Model with Adversarial Self-learning for Visual Question Answering", "National University of Singapore, Singapore, Singapore", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2017"], ["Visual Question Answering with Question Representation Update (QRU)", "", "Chinese University of Hong Kong", "Chinese University of Hong Kong", "Hong Kong, \u99ac\u6599\u6c34\u6c60\u65c1\u8def", "22.41626320", "114.21093180", "edu", "", "China", "2016"], ["Explainable Neural Computation via Stack Neural Module Networks", "", "Boston University", "Boston University", "BU, Bay State Road, Fenway, Boston, Suffolk County, Massachusetts, 02215, USA", "42.35042530", "-71.10056114", "edu", "", "United States", "2018"], ["Lessons Learned in Multilingual Grounded Language Learning", "", "University of Copenhagen", "University of Copenhagen", "K\u00f8benhavns Universitet, Krystalgade, K\u00f8dbyen, Vesterbro, K\u00f8benhavn, K\u00f8benhavns Kommune, Region Hovedstaden, 1165, Danmark", "55.68015020", "12.57232700", "edu", "", "Denmark", "2018"], ["Zero-Shot Visual Question Answering", "", "University of Adelaide", "University of Adelaide", "University of Adelaide, North Terrace, Adelaide, 5000, City of Adelaide, South Australia, 5000, Australia", "-34.91892260", "138.60423668", "edu", "", "Australia", "2016"], ["Deep Multi-Modal Image Correspondence Learning", "", "Microsoft", "Microsoft Corporation, Redmond, WA, USA", "One Microsoft Way, Redmond, WA 98052, USA", "47.64233180", "-122.13693020", "company", "", "United States", "2016"], ["Learning Answer Embeddings for Visual Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Visual Question Answer Diversity", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2018"], ["Assignment 4 : Reading Comprehension", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["Combating Human Trafficking with Deep Multimodal Models", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Bidirectional Attention Flow for Machine Comprehension", "", "University of Washington", "University of Washington", "University of Washington, Rainier Vista, Montlake, University District, Seattle, King County, Washington, 98195, USA", "47.65432380", "-122.30800894", "edu", "", "United States", "2016"], ["Inverse Visual Question Answering", "", "Southeast University", "Southeast University", "SEU, \u4f53\u80b2\u9986\u8def, \u65b0\u8857\u53e3, \u6708\u5b63\u56ed, \u7384\u6b66\u533a, \u5357\u4eac\u5e02, \u6c5f\u82cf\u7701, 210008, \u4e2d\u56fd", "32.05752790", "118.78682252", "edu", "", "China", ""], ["Improving Deep Visual Representation for Person Re-identification by Global and Local Image-language Association", "", "SenseTime", "SenseTime", "China, Beijing Shi, Haidian Qu, WuDaoKou, Zhongguancun E Rd, 1\u53f7-7", "39.99300800", "116.32988200", "company", "1 Zhongguancun E Rd, Haidian Qu, China", "China", "2018"], ["Leveraging Video Descriptions to Learn Video Question Answering", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["Enhancing Visual Question Answering Using Dropout", "University of Chinese Academy of Sciences, Beijing, China", "University of Chinese Academy of Sciences", "University of Chinese Academy of Sciences", "University of Chinese Academy of Sciences, UCAS, Yuquanlu, \u7389\u6cc9\u8def, \u7530\u6751, \u6d77\u6dc0\u533a, 100049, \u4e2d\u56fd", "39.90828040", "116.24585270", "edu", "", "China", "2018"], ["Analyzing the Behavior of Visual Question Answering Models", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2016"], ["Video question answering via multi-granularity temporal attention network learning", "Zhejiang University, HIKVISION", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2018"], ["Inferring and Executing Programs for Visual Reasoning", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["Adversarial Learning of Answer-Related Representation for Visual Question Answering", "Beihang University, Beijing, China", "Beihang University", "Beihang University", "\u5317\u4eac\u822a\u7a7a\u822a\u5929\u5927\u5b66, 37, \u5b66\u9662\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100083, \u4e2d\u56fd", "39.98083330", "116.34101249", "edu", "", "China", "2018"], ["Do Explanations make VQA Models more Predictable to a Human?", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2018"], ["Grounded Textual Entailment", "", "University of Malta", "University of Malta", "University of Malta, Ring Road, Japanese Garden, L-Imsida, Malta, MSD 9027, Malta", "35.90232260", "14.48341890", "edu", "", "Malta", "2018"], ["DVQA: Understanding Data Visualizations via Question Answering", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2018"], ["Visual7W: Grounded Question Answering in Images", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2016"], ["Diverse and Coherent Paragraph Generation from Images", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2018"], ["Visual Dialog", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["ANALYZING HANDS WITH FIRST-PERSON COMPUTER VISION", "", "Indiana University", "Indiana University", "Indiana University East, West Cart Road, Richmond, Wayne County, Indiana, 47374, USA", "39.86948105", "-84.87956905", "edu", "", "United States", "2016"], ["BundleNet: Learning with Noisy Label via Sample Correlations", "Chinese Academy of Sciences, Institute of Automation, Beijing, China", "Chinese Academy of Sciences", "Chinese Academy of Sciences", "\u4e2d\u56fd\u79d1\u5b66\u9662\u5fc3\u7406\u7814\u7a76\u6240, 16, \u6797\u8403\u8def, \u671d\u9633\u533a / Chaoyang, \u5317\u4eac\u5e02, 100101, \u4e2d\u56fd", "40.00447950", "116.37023800", "edu", "", "China", "2018"], ["Deep Compositional Question Answering with Neural Module Networks", "", "University of California, Berkeley", "University of California, Berkeley", "Berkeley Art Museum and Pacific Film Archive, Bancroft Way, Southside, Berkeley, Alameda County, California, 94720-1076, USA", "37.86871260", "-122.25586815", "edu", "", "United States", "2015"], ["Faithful Multimodal Explanation for Visual Question Answering", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2018"], ["Zero-Shot Transfer VQA Dataset", "", "Baidu Research, USA", "Baidu Research, USA", "1195 Bordeaux Dr, Sunnyvale, CA 94089, USA", "37.40922650", "-122.02366150", "company", "", "United States", "2018"], ["Grounding Referring Expressions in Images by Variational Context", "", "Nanyang Technological University", "Nanyang Technological University", "NTU, Faculty Avenue, Jurong West, Southwest, 637460, Singapore", "1.34841040", "103.68297965", "edu", "", "Singapore", "2017"], ["Visual Question Answering Dataset for Bilingual Image Understanding: A Study of Cross-Lingual Transfer Using Attention Maps", "", "Tokyo Institute of Technology", "Tokyo Institute of Technology", "\u6771\u4eac\u5de5\u696d\u5927\u5b66, \u539a\u6728\u8857\u9053, \u7dd1\u533a, \u753a\u7530\u5e02, \u795e\u5948\u5ddd\u770c, \u95a2\u6771\u5730\u65b9, 226-0026, \u65e5\u672c", "35.51675380", "139.48342251", "edu", "", "Japan", "2018"], ["exposure phase : images of individuals with one linguistic attribute query phase : select instructed and evaluated barkeeper instruct evaluate instruct amuseinstruct evaluate correct answer", "", "University of Trento", "University of Trento", "University of Trento, Via Giuseppe Verdi, Piedicastello, Trento, Territorio Val d'Adige, TN, TAA, 38122, Italia", "46.06588360", "11.11598940", "edu", "", "Italy", "2017"], ["Visual Text Correction", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2018"], ["Classifying Community QA Questions That Contain an Image", "Waseda University, Tokyo, Japan", "Waseda University", "Waseda University", "\u65e9\u7a32\u7530\u5927\u5b66 \u5317\u4e5d\u5dde\u30ad\u30e3\u30f3\u30d1\u30b9, 2-2, \u6709\u6bdb\u5f15\u91ce\u7dda, \u516b\u5e61\u897f\u533a, \u5317\u4e5d\u5dde\u5e02, \u798f\u5ca1\u770c, \u4e5d\u5dde\u5730\u65b9, 808-0135, \u65e5\u672c", "33.88987280", "130.70856205", "edu", "", "Japan", "2018"], ["Examine before You Answer: Multi-task Learning with Adaptive-attentions for Multiple-choice VQA", "University of Electronic Science and Technology of China, Chengdu, China", "University of Electronic Science and Technology of China", "University of Electronic Science and Technology of China", "Columbus, OH 43210, USA", "40.01419050", "-83.03091430", "edu", "", "United States", "2018"], ["Neural-Symbolic VQA: Disentangling Reasoning from Vision and Language Understanding", "", "MIT CSAIL", "MIT CSAIL", "32 Vassar St, Cambridge, MA 02139, USA", "42.36194070", "-71.09043780", "edu", "", "United States", "2018"], ["Grad-CAM: Visual Explanations from Deep Networks via Gradient-Based Localization", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["Towards a Better Metric for Evaluating Question Generation Systems", "", "IIT Madras, India", "IIT Madras, India", "Indian Institute Of Technology, Chennai, Tamil Nadu 600036, India", "12.99149290", "80.23369070", "edu", "", "", "2018"], ["Visual Explanations from Hadamard Product in Multimodal Deep Networks", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2017"], ["Answerer in Questioner's Mind: Information Theoretic Approach to Goal-Oriented Visual Dialog", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2018"], ["Situation Recognition with Graph Neural Networks", "", "Chinese University of Hong Kong", "Chinese University of Hong Kong", "Hong Kong, \u99ac\u6599\u6c34\u6c60\u65c1\u8def", "22.41626320", "114.21093180", "edu", "", "China", "2017"], ["Bridging Languages through Images with Deep Partial Canonical Correlation Analysis", "", "University of Cambridge", "University of Cambridge", "Clifford Allbutt Lecture Theatre, Robinson Way, Romsey, Cambridge, Cambridgeshire, East of England, England, CB2 0QH, UK", "52.17638955", "0.14308882", "edu", "", "United Kingdom", "2018"], ["Unifying the Video and Question Attentions for Open-Ended Video Question Answering", "State Key Laboratory of CAD&CG, Zhejiang University, Hangzhou, China", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2017"], ["Auto-Classification of Retinal Diseases in the Limit of Sparse Data Using a Two-Streams Machine Learning Model", "", "National Taiwan University", "National Taiwan University", "\u81fa\u5927;\u53f0\u5927, 1, \u7f85\u65af\u798f\u8def\u56db\u6bb5, \u5b78\u5e9c\u91cc, \u5927\u5b89\u5340, \u81fa\u5317\u5e02, 10617, \u81fa\u7063", "25.01682835", "121.53846924", "edu", "", "Taiwan", "2018"], ["Crowdsourcing Question-Answer Meaning Representations", "", "University of Washington", "University of Washington", "University of Washington, Rainier Vista, Montlake, University District, Seattle, King County, Washington, 98195, USA", "47.65432380", "-122.30800894", "edu", "", "United States", "2018"], ["CoDraw: Visual Dialog for Collaborative Drawing", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2017"], ["Context-Aware Captions from Context-Agnostic Supervision", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Middle-Out Decoding", "", "University of British Columbia", "University of British Columbia", "University of British Columbia, Eagles Drive, Hawthorn Place, University Endowment Lands, Metro Vancouver, British Columbia, V6T, Canada", "49.25839375", "-123.24658161", "edu", "", "Canada", "2018"], ["Contextualized Bilinear Attention Networks", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2018"], ["Image Caption Validation", "", "Princeton University", "Princeton University", "Lot 9, University Place, Princeton Township, Mercer County, New Jersey, 08540, USA", "40.34829285", "-74.66308325", "edu", "", "United States", "2018"], ["Grad-CAM: Why did you say that? Visual Explanations from Deep Networks via Gradient-based Localization", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Show, Adapt and Tell: Adversarial Training of Cross-Domain Image Captioner", "", "Microsoft Research Asia", "Microsoft Research Asia", "1 Memorial Dr, Cambridge, MA 02142, USA", "42.36142560", "-71.08120920", "company", "", "United States", "2017"], ["DeepSIC: Deep Semantic Image Compression", "", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2018"], ["Neural Self Talk: Image Understanding via Continuous Questioning and Answering", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2015"], ["Learning to Count Objects in Natural Images for Visual Question Answering", "", "University of Southampton", "University of Southampton", "Waterfront Campus, European Way, Port of Southampton, St Mary's, Southampton, South East, England, SO14 3JW, UK", "50.89273635", "-1.39464295", "edu", "", "United Kingdom", "2018"], ["ViP-CNN: Visual Phrase Guided Convolutional Neural Network", "", "Hong Kong", "Hong Kong", "Hong Kong", "22.39642800", "114.10949700", "edu", "", "China", "2017"], ["JUST at VQA-Med: A VGG-Seq2Seq Model", "", "Jordan University of Science and Technology", "Jordan University of Science and Technology", "Jordan University of Science and Technology, \u0634\u0627\u0631\u0639 \u0627\u0644\u0623\u0631\u062f\u0646, \u0625\u0631\u0628\u062f\u200e, \u0625\u0631\u0628\u062f, \u0627\u0644\u0623\u0631\u062f\u0646", "32.49566485", "35.99160717", "edu", "", "Jordan", "2018"], ["Context-Aware Visual Policy Network for Sequence-Level Image Captioning", "", "Nanyang Technological University", "Nanyang Technological University", "NTU, Faculty Avenue, Jurong West, Southwest, 637460, Singapore", "1.34841040", "103.68297965", "edu", "", "Singapore", "2018"], ["Residual Attention Networks for Image Classification", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["Multimodal Attribute Extraction", "", "University of California", "University of California", "Berkeley, CA, USA", "37.87189920", "-122.25853990", "edu", "", "United States", "2017"], ["Toward Driving Scene Understanding: A Dataset for Learning Driver Behavior and Causal Reasoning", "", "Boston University", "Boston University", "BU, Bay State Road, Fenway, Boston, Suffolk County, Massachusetts, 02215, USA", "42.35042530", "-71.10056114", "edu", "", "United States", "2018"], ["Scene Graph Generation from Objects, Phrases and Region Captions", "", "MIT", "Massachusetts Institute", "MIT, Amherst Street, Cambridgeport, Cambridge, Middlesex County, Massachusetts, 02238, USA", "42.35839610", "-71.09567788", "edu", "", "United States", "2017"], ["Image Understanding using vision and reasoning through Scene Description Graph", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2018"], ["Don't Just Assume; Look and Answer: Overcoming Priors for Visual Question Answering", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Software-Hardware Codesign for Efficient Neural Network Acceleration", "University of California, Santa Barbara", "University of California, Santa Barbara", "University of California, Santa Barbara", "UCSB, Santa Barbara County, California, 93106, USA", "34.41459370", "-119.84581950", "edu", "", "United States", "2017"], ["Supplementary Material : Cross-Dataset Adaptation for Visual Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Multi-Networks Joint Learning for Large-Scale Cross-Modal Retrieval", "University of Texas at San Antonio, San Antonio, TX, USA", "University of Texas at San Antonio", "University of Texas at San Antonio", "UTSA, Paseo Principal, San Antonio, Bexar County, Texas, 78249-1620, USA", "29.58333105", "-98.61944505", "edu", "", "United States", "2017"], ["Differential Attention for Visual Question Answering", "", "IIT Kanpur", "IIT Kanpur", "Kalyanpur, Kanpur, Uttar Pradesh 208016, India", "26.51233880", "80.23290000", "edu", "", "India", "2018"], ["Spatial Memory for Context Reasoning in Object Detection", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Totally Looks Like - How Humans Compare, Compared to Machines", "", "York University", "York University", "York University, Keele Campus, Campus Walk, North York, Toronto, Ontario, M3J 2S5, Canada", "43.77439110", "-79.50481085", "edu", "", "Canada", "2018"], ["Improved Fusion of Visual and Language Representations by Dense Symmetric Co-Attention for Visual Question Answering", "", "Tohoku University", "Tohoku University", "Tohoku University, \u4e94\u6a4b\u901a, \u9752\u8449\u533a, \u4ed9\u53f0\u5e02, \u5bae\u57ce\u770c, \u6771\u5317\u5730\u65b9, 980-0811, \u65e5\u672c", "38.25309450", "140.87365930", "edu", "", "Japan", "2018"], ["Textually Enriched Neural Module Networks for Visual Question Answering", "", "Carnegie Mellon University", "Carnegie Mellon University Pittsburgh, PA - 15213, USA", "Carnegie Mellon University, Forbes Avenue, Squirrel Hill North, PGH, Allegheny County, Pennsylvania, 15213, USA", "40.44416190", "-79.94272826", "edu", "", "United States", "2018"], ["Deep Cross-Modal Projection Learning for Image-Text Matching", "", "Dalian University of Technology", "Dalian University of Technology", "\u5927\u8fde\u7406\u5de5\u5927\u5b66, \u7ea2\u51cc\u8def, \u7518\u4e95\u5b50\u533a, \u51cc\u6c34\u9547, \u7518\u4e95\u5b50\u533a / Ganjingzi, \u5927\u8fde\u5e02 / Dalian, \u8fbd\u5b81\u7701, 116023, \u4e2d\u56fd", "38.88140235", "121.52281098", "edu", "", "China", "2018"], ["Cross-media analysis and reasoning: advances and directions", "School of Electronics Engineering and Computer Science, Peking University, Beijing, China", "Peking University", "Peking University", "\u5317\u4eac\u5927\u5b66, 5\u53f7, \u9890\u548c\u56ed\u8def, \u7a3b\u9999\u56ed\u5357\u793e\u533a, \u6d77\u6dc0\u533a, \u5317\u4eac\u5e02, 100871, \u4e2d\u56fd", "39.99223790", "116.30393816", "edu", "", "China", "2017"], ["Learning to hash-tag videos with Tag2Vec", "", "CVIT, KCIS, IIIT Hyderabad, India", "CVIT, KCIS, IIIT Hyderabad, India", "IIIT, Gachibowli, Gachibowli, Hyderabad, Telangana 500032, India", "17.44509810", "78.34976780", "edu", "", "India", "2016"], ["Describing Natural Images Containing Novel Objects with Knowledge Guided Assitance", "", "Karlsruhe Institute of Technology", "Karlsruhe Institute of Technology", "KIT, Leopoldshafener Allee, Linkenheim, Linkenheim-Hochstetten, Landkreis Karlsruhe, Regierungsbezirk Karlsruhe, Baden-W\u00fcrttemberg, 76351, Deutschland", "49.10184375", "8.43312560", "edu", "", "Germany", "2017"], ["Video Question Answering via Hierarchical Dual-Level Attention Network Learning", "Zhejiang University, Hang Zhou, China", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2017"], ["Survey of the State of the Art in Natural Language Generation: Core tasks, applications and evaluation", "", "University of Malta", "University of Malta", "University of Malta, Ring Road, Japanese Garden, L-Imsida, Malta, MSD 9027, Malta", "35.90232260", "14.48341890", "edu", "", "Malta", "2018"], ["User-guided Hierarchical Attention Network for Multi-modal Social Image Popularity Prediction", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2018"], ["Criteria for Human-Compatible AI in Two-Player Vision-Language Tasks", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2017"], ["MSRC: Multimodal Spatial Regression with Semantic Context for Phrase Grounding", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2017"], ["Measuring Machine Intelligence Through Visual Question Answering", "", "Microsoft", "Microsoft Corporation, Redmond, WA, USA", "One Microsoft Way, Redmond, WA 98052, USA", "47.64233180", "-122.13693020", "company", "", "United States", "2016"], ["Learning Robust, Transferable Sentence Representations for Text Classification", "", "University of California, Los Angeles", "University of California, Los Angeles", "200 UCLA, Medical Plaza Driveway Suite 540, Los Angeles, CA 90095, USA", "34.06877880", "-118.44500940", "edu", "", "United States", "2018"], ["DualNet: Domain-invariant network for visual question answering", "", "University of Tokyo", "University of Tokyo", "\u6771\u4eac\u5927\u5b66 \u67cf\u30ad\u30e3\u30f3\u30d1\u30b9, \u5b66\u878d\u5408\u306e\u9053, \u67cf\u5e02, \u5343\u8449\u770c, \u95a2\u6771\u5730\u65b9, 277-8583, \u65e5\u672c", "35.90204480", "139.93622009", "edu", "", "Japan", "2017"], ["Simple Baseline for Visual Question Answering", "", "MIT", "Massachusetts Institute", "MIT, Amherst Street, Cambridgeport, Cambridge, Middlesex County, Massachusetts, 02238, USA", "42.35839610", "-71.09567788", "edu", "", "United States", "2015"], ["What Action Causes This? Towards Naive Physical Action-Effect Prediction", "", "Microsoft", "Microsoft Corporation, Redmond, WA, USA", "One Microsoft Way, Redmond, WA 98052, USA", "47.64233180", "-122.13693020", "company", "", "United States", "2018"], ["Simple and effective visual question answering in a single modality", "Zhejiang University, College of Computer Science, Hangzhou, P. R. China", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2016"], ["Structured Triplet Learning with POS-Tag Guided Attention for Visual Question Answering", "", "ETH Zurich", "ETH Zurich", "R\u00e4mistrasse 101, 8092 Z\u00fcrich, Switzerland", "47.37631300", "8.54766990", "edu", "", "Switzerland", "2018"], ["VizWiz Grand Challenge: Answering Visual Questions from Blind People", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2018"], ["Goal Driven Detection in Natural Scenes", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2018"], ["GLAC Net: GLocal Attention Cascading Networks for Multi-image Cued Story Generation", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2018"], ["Deep Attention Neural Tensor Network for Visual Question Answering", "", "Microsoft Research Asia", "Microsoft Research Asia", "1 Memorial Dr, Cambridge, MA 02142, USA", "42.36142560", "-71.08120920", "company", "", "United States", "2018"], ["Incorporating Deep Visual Features into Multiobjective based Multi-view Search Results Clustering", "", "Dublin City University", "DUBLIN CITY UNIVERSITY", "Dublin City University Glasnevin Campus, Lower Car Park, Wad, Whitehall A ED, Dublin 9, Dublin, County Dublin, Leinster, D09 FW22, Ireland", "53.38522185", "-6.25740874", "edu", "", "Ireland", "2018"], ["Deep learning evaluation using deep linguistic processing", "", "University of Cambridge", "University of Cambridge", "Clifford Allbutt Lecture Theatre, Robinson Way, Romsey, Cambridge, Cambridgeshire, East of England, England, CB2 0QH, UK", "52.17638955", "0.14308882", "edu", "", "United Kingdom", "2017"], ["Tips and Tricks for Visual Question Answering: Learnings from the 2017 Challenge", "", "Australian National University", "Australian National University", "Canberra ACT 0200, Australia", "-35.27769990", "149.11852700", "edu", "", "Australia", "2017"], ["Explaining Explanations: An Approach to Evaluating Interpretability of Machine Learning", "", "MIT", "Massachusetts Institute", "MIT, Amherst Street, Cambridgeport, Cambridge, Middlesex County, Massachusetts, 02238, USA", "42.35839610", "-71.09567788", "edu", "", "United States", "2018"], ["A Joint Sequence Fusion Model for Video Question Answering and Retrieval", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2018"], ["Visual Question Answering: A Tutorial", "Australian Centre for Visual Technologies, University of Adelaide, 5005 South Australia, Australia", "University of Adelaide", "University of Adelaide", "University of Adelaide, North Terrace, Adelaide, 5000, City of Adelaide, South Australia, 5000, Australia", "-34.91892260", "138.60423668", "edu", "", "Australia", "2017"], ["iVQA: Inverse Visual Question Answering", "", "Southeast University", "Southeast University", "SEU, \u4f53\u80b2\u9986\u8def, \u65b0\u8857\u53e3, \u6708\u5b63\u56ed, \u7384\u6b66\u533a, \u5357\u4eac\u5e02, \u6c5f\u82cf\u7701, 210008, \u4e2d\u56fd", "32.05752790", "118.78682252", "edu", "", "China", "2017"], ["Dynamic Graph Generation Network: Generating Relational Knowledge from Diagrams", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2017"], ["Incorporating External Knowledge to Answer Open-Domain Visual Questions with Dynamic Memory Networks", "", "Tsinghua University", "Tsinghua University", "\u6e05\u534e\u5927\u5b66, 30, \u53cc\u6e05\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100084, \u4e2d\u56fd", "40.00229045", "116.32098908", "edu", "", "China", "2017"], ["Task-driven Visual Saliency and Attention-based Visual Question Answering", "", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2017"], ["TasvirEt: A benchmark dataset for automatic Turkish description generation from images", "Hacettepe Bilgisayarl\u0131 G\u00f6r\u00fc Laboratuvar\u0131 (HUCVL), Hacettepe \u00dcniversitesi, Turkey", "Bilgisayar M\u00fch. B\u00f6l\u00fcm\u00fc, Orta Do\u011fu Teknik \u00dcniversitesi, Turkey", "Bilgisayar M\u00fch. B\u00f6l\u00fcm\u00fc, Orta Do\u011fu Teknik \u00dcniversitesi, Turkey", "\u00dcniversiteler Mh., Dumlup\u0131nar Blv. No:1, 06800 \u00c7ankaya/Ankara, Turkey", "39.89102030", "32.77800270", "edu", "", "", "2016"], ["Motion-Appearance Co-Memory Networks for Video Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Learning to Attend On Essential Terms: An Enhanced Retriever-Reader Model for Scientific Question Answering", "", "UC San Diego", "UC San Diego", "9500 Gilman Dr, La Jolla, CA 92093, USA", "32.88006040", "-117.23401350", "edu", "", "United States", "2018"], ["Proposal Incorporating Structural Bias into Neural Networks", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Reducing Overfitting in Deep Networks by Decorrelating Representations", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Fusing attention with visual question answering", "Computational NeuroEngineering Laboratory, Department of Electrical and Computer Engineering, University of Florida, Gainesville, Florida 32601", "University of Florida", "University of Florida", "University of Florida, Southwest 16th Avenue, Diamond Village Apartments, City of Gainesville Municipal Boundaries, Alachua County, Florida, 32611, USA", "29.63287840", "-82.34901330", "edu", "", "United States", "2017"], ["HARRISON: A Benchmark on HAshtag Recommendation for Real-world Images in Social Networks", "", "KAIST", "KAIST", "291 Daehak-ro, Eoeun-dong, Yuseong-gu, Daejeon, South Korea", "36.37214270", "127.36039000", "edu", "", "South Korea", "2016"], ["Goal Driven Detection in Natural Scenes Anonymous", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2018"], ["Neural ranking for automatic image annotation", "Zhejiang Future Technology Institute, Jiaxing, China", "Zhejiang Future Technology Institute, Jiaxing, China", "Zhejiang Future Technology Institute, Jiaxing, China", "China, Zhejiang, Ningbo, Yinzhou, Fangguang Rd, \u8054\u4e30\u7acb\u4ea4\u6865\u5357", "29.86538900", "121.50867200", "company", "", "China", "2018"], ["TVQA: Localized, Compositional Video Question Answering", "", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill, East Cameron Avenue, Chapel Hill, Orange County, North Carolina, 27514, USA", "35.91139710", "-79.05045290", "edu", "", "United States", "2018"], ["Multi-modal Factorized Bilinear Pooling with Co-attention Learning for Visual Question Answering", "", "University of Sydney", "University of Sydney", "USyd, Fisher Road, Camperdown, Sydney, NSW, 2006, Australia", "-33.88890695", "151.18943366", "edu", "", "Australia", "2017"], ["A Reinforcement Learning Framework for Natural Question Generation using Bi-discriminators", "", "Fudan University", "Fudan University", "\u590d\u65e6\u5927\u5b66, 220, \u90af\u90f8\u8def, \u4e94\u89d2\u573a\u8857\u9053, \u6768\u6d66\u533a, \u4e0a\u6d77\u5e02, 200433, \u4e2d\u56fd", "31.30104395", "121.50045497", "edu", "", "China", "2018"], ["PipeLayer: A Pipelined ReRAM-Based Accelerator for Deep Learning", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2017"], ["Interactive Text2Pickup Networks for Natural Language-Based Human\u2013Robot Collaboration", "Kakao Brain, Seongnam, South Korea", "Kakao Brain, Seongnam, South Korea", "Kakao Brain, Seongnam, South Korea", "South Korea, Gyeonggi-do, Seongnam-si, Bundang-gu, \uc0bc\ud3c9\ub3d9 685 11\uce35", "37.40269360", "127.10740820", "company", "", "South Korea", "2018"], ["FOIL it! Find One mismatch between Image and Language caption", "", "University of Trento", "University of Trento", "University of Trento, Via Giuseppe Verdi, Piedicastello, Trento, Territorio Val d'Adige, TN, TAA, 38122, Italia", "46.06588360", "11.11598940", "edu", "", "Italy", "2017"], ["Stacking With Auxiliary Features : Improved Ensembling for Natural Language and Vision", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2016"], ["What is the Role of Recurrent Neural Networks (RNNs) in an Image Caption Generator?", "", "University of Malta", "University of Malta", "University of Malta, Ring Road, Japanese Garden, L-Imsida, Malta, MSD 9027, Malta", "35.90232260", "14.48341890", "edu", "", "Malta", "2017"], ["Answering Image Riddles using Vision and Reasoning through Probabilistic Soft Logic", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2016"], ["Visual Question Answering : Datasets , Methods , Challenges and Oppurtunities", "", "Princeton University", "Princeton University", "Lot 9, University Place, Princeton Township, Mercer County, New Jersey, 08540, USA", "40.34829285", "-74.66308325", "edu", "", "United States", "2018"], ["An Analysis of Visual Question Answering Algorithms", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2017"], ["Best of Both Worlds: Transferring Knowledge from Discriminative Learning to a Generative Visual Dialog Model", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Automatic Understanding of Image and Video Advertisements", "", "University of Pittsburgh", "University of Pittsburgh", "University of Pittsburgh, Sutherland Drive, West Oakland, PGH, Allegheny County, Pennsylvania, 15240, USA", "40.44415295", "-79.96243993", "edu", "", "United States", "2017"], ["MemexQA: Visual Memex Question Answering", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Visual Relationship Detection Based on Local Feature and Context Feature", "Beijing University of Posts and Telecommunications, Beijing, 100876, China", "Beijing University of Posts and Telecommunications", "Beijing University of Posts and Telecommunications", "\u5317\u4eac\u90ae\u7535\u5927\u5b66, \u897f\u571f\u57ce\u8def, \u6d77\u6dc0\u533a, \u5317\u4eac\u5e02, 100082, \u4e2d\u56fd", "39.96014880", "116.35193921", "edu", "", "China", "2018"], ["A Restricted Visual Turing Test for Deep Scene and Event Understanding", "", "University of California, Los Angeles", "University of California, Los Angeles", "200 UCLA, Medical Plaza Driveway Suite 540, Los Angeles, CA 90095, USA", "34.06877880", "-118.44500940", "edu", "", "United States", "2015"], ["Unsupervised Visual Sense Disambiguation for Verbs using Multimodal Embeddings", "", "University of Edinburgh", "University of Edinburgh", "New College, New College Courtyard, The Mound, Old Town, Edinburgh, City of Edinburgh, Scotland, EH1 2LX, UK", "55.94951105", "-3.19534913", "edu", "", "United Kingdom", "2016"], ["Exploring Human-Like Attention Supervision in Visual Question Answering", "", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2018"], ["Sketch Recognition with Deep Visual-Sequential Fusion Model", "", "Fudan University", "Fudan University", "\u590d\u65e6\u5927\u5b66, 220, \u90af\u90f8\u8def, \u4e94\u89d2\u573a\u8857\u9053, \u6768\u6d66\u533a, \u4e0a\u6d77\u5e02, 200433, \u4e2d\u56fd", "31.30104395", "121.50045497", "edu", "", "China", "2017"], ["Mapping Instructions and Visual Observations to Actions with Reinforcement Learning", "", "Microsoft", "Microsoft Corporation, Redmond, WA, USA", "One Microsoft Way, Redmond, WA 98052, USA", "47.64233180", "-122.13693020", "company", "", "United States", "2017"], ["Automatic Generation of Grounded Visual Questions", "", "Tianjin University", "Tianjin University", "\u6cf0\u5c71\u822a\u7a7a\u6e2f/\u5929\u6d25\u5927\u53a6, \u67a3\u884c\u8def, \u67a3\u884c \u9ad8\u738b\u5bfa, \u957f\u57ce\u8def, \u5927\u6cb3, \u5cb1\u5cb3\u533a (Daiyue), \u6cf0\u5b89\u5e02, \u5c71\u4e1c\u7701, 271000, \u4e2d\u56fd", "36.20304395", "117.05842113", "edu", "", "China", "2017"], ["NTUA-SLP at SemEval-2018 Task 1: Predicting Affective Content in Tweets with Deep Attentive RNNs and Transfer Learning", "", "National Technical University of Athens", "National Technical University of Athens", "\u0395\u03b8\u03bd\u03b9\u03ba\u03cc \u039c\u03b5\u03c4\u03c3\u03cc\u03b2\u03b9\u03bf \u03a0\u03bf\u03bb\u03c5\u03c4\u03b5\u03c7\u03bd\u03b5\u03af\u03bf, \u03a3\u03c4\u03bf\u03c5\u03c1\u03bd\u03ac\u03c1\u03b7, \u039c\u03bf\u03c5\u03c3\u03b5\u03af\u03bf, \u0391\u03b8\u03ae\u03bd\u03b1, \u0394\u03ae\u03bc\u03bf\u03c2 \u0391\u03b8\u03b7\u03bd\u03b1\u03af\u03c9\u03bd, \u03a0.\u0395. \u039a\u03b5\u03bd\u03c4\u03c1\u03b9\u03ba\u03bf\u03cd \u03a4\u03bf\u03bc\u03ad\u03b1 \u0391\u03b8\u03b7\u03bd\u03ce\u03bd, \u03a0\u03b5\u03c1\u03b9\u03c6\u03ad\u03c1\u03b5\u03b9\u03b1 \u0391\u03c4\u03c4\u03b9\u03ba\u03ae\u03c2, \u0391\u03c4\u03c4\u03b9\u03ba\u03ae, 11250, \u0395\u03bb\u03bb\u03ac\u03b4\u03b1", "37.98782705", "23.73179733", "edu", "", "Greece", "2018"], ["Reasoning About Fine-Grained Attribute Phrases Using Reference Games", "", "University of Massachusetts", "University of Massachusetts", "University of Massachusetts, Hicks Way, Amherst, Hampshire, Massachusetts, 01003, USA", "42.38897850", "-72.52869870", "edu", "", "United States", "2017"], ["Image Surveillance Assistant Architecture : Status and Planned Extensions", "", "Naval Research Laboratory", "Naval Research Laboratory", "Naval Research Laboratory Post Office, 4555, Overlook Avenue Southwest, Washington, D.C., 20375, USA", "38.82313810", "-77.01789020", "mil", "", "United States", "2016"], ["Predicting Motivations of Actions by Leveraging Text", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2016"], ["Towards Understanding End-of-trip Instructions in a Taxi Ride Scenario", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Scene-Centric Joint Parsing of Cross-View Videos", "", "University of California", "University of California", "Berkeley, CA, USA", "37.87189920", "-122.25853990", "edu", "", "United States", "2018"], ["Visual Question Answering as a Meta Learning Task", "", "University of Adelaide", "University of Adelaide", "University of Adelaide, North Terrace, Adelaide, 5000, City of Adelaide, South Australia, 5000, Australia", "-34.91892260", "138.60423668", "edu", "", "Australia", "2018"], ["CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["Goal-Oriented Visual Question Generation via Intermediate Rewards", "", "Nanjing University", "Nanjing University", "NJU, \u4e09\u6c5f\u8def, \u9f13\u697c\u533a, \u5357\u4eac\u5e02, \u6c5f\u82cf\u7701, 210093, \u4e2d\u56fd", "32.05659570", "118.77408833", "edu", "", "China", "2018"], ["Video Visual Relation Detection", "", "Singapore", "Singapore", "Singapore", "1.35208300", "103.81983600", "edu", "", "Singapore", "2017"], ["A Corpus for Reasoning About Natural Language Grounded in Photographs", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2018"], ["Explaining the Unexplained: A CLass-Enhanced Attentive Response (CLEAR) Approach to Understanding Deep Neural Networks", "", "University of Waterloo", "University of Waterloo", "University of Waterloo, 200, University Avenue West, Northdale, Beechwood, Waterloo, Regional Municipality of Waterloo, Ontario, N2L 3G1, Canada", "43.47061295", "-80.54724732", "edu", "", "Canada", "2017"], ["Cascaded Mutual Modulation for Visual Reasoning", "", "University of Chinese Academy of Sciences", "University of Chinese Academy of Sciences", "University of Chinese Academy of Sciences, UCAS, Yuquanlu, \u7389\u6cc9\u8def, \u7530\u6751, \u6d77\u6dc0\u533a, 100049, \u4e2d\u56fd", "39.90828040", "116.24585270", "edu", "", "China", "2018"], ["Leveraging Visual Question Answering for Image-Caption Ranking", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["How clever is the FiLM model, and how clever can it be?", "", "University of Cambridge", "University of Cambridge", "Clifford Allbutt Lecture Theatre, Robinson Way, Romsey, Cambridge, Cambridgeshire, East of England, England, CB2 0QH, UK", "52.17638955", "0.14308882", "edu", "", "United Kingdom", "2018"], ["Going Deeper with Semantics : Video Activity Interpretation using Semantic Contextualization", "", "University of South Florida", "University of South Florida", "University of South Florida, Leroy Collins Boulevard, Tampa, Hillsborough County, Florida, 33620, USA", "28.05999990", "-82.41383619", "edu", "", "United States", "2018"], ["Mind Your Language: Learning Visually Grounded Dialog in a Multi-Agent Setting", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2018"], ["Overview of ImageCLEF 2018 Medical Domain Visual Question Answering Task", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2018"], ["The Promise of Premise: Harnessing Question Premises in Visual Question Answering", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Combining Multiple Cues for Visual Madlibs Question Answering", "University of North Carolina at Chapel Hill, Chapel Hill, USA", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill, East Cameron Avenue, Chapel Hill, Orange County, North Carolina, 27514, USA", "35.91139710", "-79.05045290", "edu", "", "United States", "2018"], ["Jointly Discovering Visual Objects and Spoken Words from Raw Sensory Input", "", "MIT", "Massachusetts Institute", "MIT, Amherst Street, Cambridgeport, Cambridge, Middlesex County, Massachusetts, 02238, USA", "42.35839610", "-71.09567788", "edu", "", "United States", "2018"], ["Resolving Language and Vision Ambiguities Together: Joint Segmentation & Prepositional Attachment Resolution in Captioned Scenes", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2016"], ["Pay Attention to Those Sets! Learning Quantification from Images", "", "University of Barcelona", "University of Barcelona", "Universitat de Barcelona, Carrer de la Diputaci\u00f3, l'Antiga Esquerra de l'Eixample, Eixample, Barcelona, BCN, CAT, 08013, Espa\u00f1a", "41.38689130", "2.16352385", "edu", "", "Spain", "2017"], ["Augmenting Image Question Answering Dataset by Exploiting Image Captions", "", "University of Tokyo", "University of Tokyo", "\u6771\u4eac\u5927\u5b66 \u67cf\u30ad\u30e3\u30f3\u30d1\u30b9, \u5b66\u878d\u5408\u306e\u9053, \u67cf\u5e02, \u5343\u8449\u770c, \u95a2\u6771\u5730\u65b9, 277-8583, \u65e5\u672c", "35.90204480", "139.93622009", "edu", "", "Japan", "2018"], ["Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["Weakly-Supervised Visual Grounding of Phrases with Linguistic Structures", "", "University of California, Davis", "University of California, Davis", "University of California, Davis, Apiary Drive, Yolo County, California, 95616-5270, USA", "38.53363490", "-121.79077264", "edu", "", "United States", "2017"], ["Discriminative Bimodal Networks for Visual Localization and Detection with Natural Language Queries", "", "University of Michigan", "University of Michigan", "University of Michigan, 500, Hayward Street, Ann Arbor, Washtenaw County, Michigan, 48109, USA", "42.29421420", "-83.71003894", "edu", "", "United States", "2017"], ["Computer Vision and Natural Language Processing: Recent Approaches in Multimedia and Robotics", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2016"], ["Riding Role Agent Vehicle Place Role Agent Vehicle Place Value Man Horse outside Value Dog Skateboard", "", "Chinese University of Hong Kong", "Chinese University of Hong Kong", "Hong Kong, \u99ac\u6599\u6c34\u6c60\u65c1\u8def", "22.41626320", "114.21093180", "edu", "", "China", "2017"], ["Hierarchical Question-Image Co-Attention for Visual Question Answering", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Counting Everyday Objects in Everyday Scenes", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Compositional Attention Networks for Machine Reasoning", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2018"], ["Tell-and-Answer: Towards Explainable Visual Question Answering using Attributes and Captions", "", "University of Rochester", "University of Rochester", "Memorial Art Gallery, 500, University Avenue, East End, Rochester, Monroe County, New York, 14607, USA", "43.15769690", "-77.58829158", "edu", "", "United States", "2018"], ["The Neural Painter: Multi-Turn Image Generation", "", "Cornell University", "Cornell University", "Cornell University, Forest Home Drive, Forest Home, Tompkins County, New York, 14853, USA", "42.45055070", "-76.47835130", "edu", "", "United States", "2018"], ["Connecting Language and Vision to Actions", "", "University of Adelaide", "University of Adelaide", "University of Adelaide, North Terrace, Adelaide, 5000, City of Adelaide, South Australia, 5000, Australia", "-34.91892260", "138.60423668", "edu", "", "Australia", "2018"], ["Dual Learning for Visual Question Generation", "Qualcomm Technologies, Inc., San Diego, CA", "Qualcomm Technologies, Inc., San Diego, CA", "Qualcomm Technologies, Inc., San Diego, CA", "5775 Morehouse Dr, San Diego, CA 92121, USA", "32.89607560", "-117.19559840", "company", "", "United States", "2018"], ["It Takes Two to Tango: Towards Theory of AI's Mind", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["The VQA-Machine: Learning How to Use Existing Vision Algorithms to Answer New Questions", "", "University of Adelaide", "University of Adelaide", "University of Adelaide, North Terrace, Adelaide, 5000, City of Adelaide, South Australia, 5000, Australia", "-34.91892260", "138.60423668", "edu", "", "Australia", "2017"], ["High-Order Attention Models for Visual Question Answering", "", "Technion", "Technion", "Haifa, 3200003, Israel", "32.77677830", "35.02312710", "edu", "", "Israel", "2017"], ["Vision as an Interlingua: Learning Multilingual Semantic Embeddings of Untranscribed Speech", "", "MIT", "Massachusetts Institute", "MIT, Amherst Street, Cambridgeport, Cambridge, Middlesex County, Massachusetts, 02238, USA", "42.35839610", "-71.09567788", "edu", "", "United States", "2018"], ["Understanding Representations and Reducing their Redundancy in Deep Networks", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Dynamic Neural Turing Machine with Soft and Hard Addressing Schemes.", "", "New York University", "New York University", "NYU, West 4th Street, NoHo Historic District, NoHo, Manhattan, Manhattan Community Board 2, New York County, NYC, New York, 10012, USA", "40.72925325", "-73.99625394", "edu", "", "United States", "2016"], ["Improving Context Modelling in Multimodal Dialogue Generation", "", "Heriot-Watt University", "Heriot-Watt University", "Heriot-Watt University - Edinburgh Campus, Third Gait, Currie, Gogarbank, City of Edinburgh, Scotland, EH14 4AS, UK", "55.91029135", "-3.32345777", "edu", "", "United Kingdom", "2018"], ["Transfer Learning via Unsupervised Task Discovery for Visual Question Answering", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2018"], ["Visual Coreference Resolution in Visual Dialog Using Neural Module Networks", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2018"], ["IQA: Visual Question Answering in Interactive Environments", "", "University of Washington", "University of Washington", "University of Washington, Rainier Vista, Montlake, University District, Seattle, King County, Washington, 98195, USA", "47.65432380", "-122.30800894", "edu", "", "United States", "2017"], ["Answer-Aware Attention on Grounded Question Answering in Images", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Embodied Question Answering", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["LiveBot: Generating Live Video Comments Based on Visual and Textual Contexts", "", "Microsoft Research Asia", "Microsoft Research Asia", "1 Memorial Dr, Cambridge, MA 02142, USA", "42.36142560", "-71.08120920", "company", "", "United States", "2018"], ["TGIF: A New Dataset and Benchmark on Animated GIF Description", "", "University of Rochester", "University of Rochester", "Memorial Art Gallery, 500, University Avenue, East End, Rochester, Monroe County, New York, 14607, USA", "43.15769690", "-77.58829158", "edu", "", "United States", "2016"], ["Learning Like a Child: Fast Novel Visual Concept Learning from Sentence Descriptions of Images", "", "University of California, Los Angeles", "University of California, Los Angeles", "200 UCLA, Medical Plaza Driveway Suite 540, Los Angeles, CA 90095, USA", "34.06877880", "-118.44500940", "edu", "", "United States", "2015"]]}