1
|
{"id": "35b0331dfcd2897abd5749b49ff5e2b8ba0f7a62", "paper": {"paperId": "35b0331dfcd2897abd5749b49ff5e2b8ba0f7a62", "key": "coco_qa", "title": "Exploring Models and Data for Image Question Answering", "journal": "Unknown", "address": "", "country": "", "address_type": "", "lat": "", "lng": "", "pdf_link": "https://arxiv.org/pdf/1505.02074.pdf", "report_link": "papers/35b0331dfcd2897abd5749b49ff5e2b8ba0f7a62.html", "citation_count": 191, "citations_geocoded": 116, "citations_unknown": 75, "citations_empty": 12, "citations_pdf": 165, "citations_doi": 27, "name": "COCO QA"}, "address": null, "citations": [["Automatic Understanding of Image and Video Advertisements", "", "University of Pittsburgh", "University of Pittsburgh", "University of Pittsburgh, Sutherland Drive, West Oakland, PGH, Allegheny County, Pennsylvania, 15240, USA", "40.44415295", "-79.96243993", "edu", "", "United States", "2017"], ["Tell-and-Answer: Towards Explainable Visual Question Answering using Attributes and Captions", "", "University of Rochester", "University of Rochester", "Memorial Art Gallery, 500, University Avenue, East End, Rochester, Monroe County, New York, 14607, USA", "43.15769690", "-77.58829158", "edu", "", "United States", "2018"], ["Counting Everyday Objects in Everyday Scenes", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Object Referring in Videos with Language and Human Gaze", "", "ETH Zurich", "ETH Zurich", "R\u00e4mistrasse 101, 8092 Z\u00fcrich, Switzerland", "47.37631300", "8.54766990", "edu", "", "Switzerland", "2018"], ["A Focused Dynamic Attention Model for Visual Question Answering", "", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2016"], ["Sherlock: Scalable Fact Learning in Images", "", "Rutgers University", "Rutgers University", "Rutgers Cook Campus - North, Biel Road, New Brunswick, Middlesex County, New Jersey, 08901, USA", "40.47913175", "-74.43168868", "edu", "", "United States", "2017"], ["Aligned Image-Word Representations Improve Inductive Transfer Across Vision-Language Tasks", "", "Google", "Google, Inc.", "1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA", "37.42199990", "-122.08405750", "company", "Google, Mountain View, CA", "United States", "2017"], ["On Human Motion Prediction Using Recurrent Neural Networks", "", "University of British Columbia", "University of British Columbia", "University of British Columbia, Eagles Drive, Hawthorn Place, University Endowment Lands, Metro Vancouver, British Columbia, V6T, Canada", "49.25839375", "-123.24658161", "edu", "", "Canada", "2017"], ["Question Answering under Instructor Guidance with Memory Networks", "", "Tsinghua University", "Tsinghua University", "\u6e05\u534e\u5927\u5b66, 30, \u53cc\u6e05\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100084, \u4e2d\u56fd", "40.00229045", "116.32098908", "edu", "", "China", ""], ["TVQA: Localized, Compositional Video Question Answering", "", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill, East Cameron Avenue, Chapel Hill, Orange County, North Carolina, 27514, USA", "35.91139710", "-79.05045290", "edu", "", "United States", "2018"], ["ABC-CNN: An Attention Based Convolutional Neural Network for Visual Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2015"], ["Supplementary Material : Cross-Dataset Adaptation for Visual Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Don't Just Assume; Look and Answer: Overcoming Priors for Visual Question Answering", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Understanding the Message of Images with Knowledge Base Traversals", "University of Mannheim, Mannheim, Germany", "University of Mannheim", "University of Mannheim, Mannheim, Germany", "68131 Mannheim, Germany", "49.48371060", "8.46223330", "edu", "", "Germany", "2016"], ["Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["Sketch Recognition with Deep Visual-Sequential Fusion Model", "", "Fudan University", "Fudan University", "\u590d\u65e6\u5927\u5b66, 220, \u90af\u90f8\u8def, \u4e94\u89d2\u573a\u8857\u9053, \u6768\u6d66\u533a, \u4e0a\u6d77\u5e02, 200433, \u4e2d\u56fd", "31.30104395", "121.50045497", "edu", "", "China", "2017"], ["Learning Answer Embeddings for Visual Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Best of Both Worlds: Transferring Knowledge from Discriminative Learning to a Generative Visual Dialog Model", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Visual Question Answering using Explicit Visual Attention", "Aristotle University of Thessaloniki, Dept. of Informatics, Thessaloniki, 54124,Greece", "Aristotle University of Thessaloniki", "Aristotle University of Thessaloniki", "\u0391\u03c1\u03b9\u03c3\u03c4\u03bf\u03c4\u03ad\u03bb\u03b5\u03b9\u03bf \u03a0\u03b1\u03bd\u03b5\u03c0\u03b9\u03c3\u03c4\u03ae\u03bc\u03b9\u03bf \u0398\u03b5\u03c3\u03c3\u03b1\u03bb\u03bf\u03bd\u03af\u03ba\u03b7\u03c2, \u0395\u03b3\u03bd\u03b1\u03c4\u03af\u03b1, \u03a3\u03b1\u03c1\u03ac\u03bd\u03c4\u03b1 \u0395\u03ba\u03ba\u03bb\u03b7\u03c3\u03af\u03b5\u03c2, \u0395\u03c5\u03b1\u03b3\u03b3\u03b5\u03bb\u03af\u03c3\u03c4\u03c1\u03b9\u03b1, \u0398\u03b5\u03c3\u03c3\u03b1\u03bb\u03bf\u03bd\u03af\u03ba\u03b7, \u0394\u03ae\u03bc\u03bf\u03c2 \u0398\u03b5\u03c3\u03c3\u03b1\u03bb\u03bf\u03bd\u03af\u03ba\u03b7\u03c2, \u03a0\u03b5\u03c1\u03b9\u03c6\u03b5\u03c1\u03b5\u03b9\u03b1\u03ba\u03ae \u0395\u03bd\u03cc\u03c4\u03b7\u03c4\u03b1 \u0398\u03b5\u03c3\u03c3\u03b1\u03bb\u03bf\u03bd\u03af\u03ba\u03b7\u03c2, \u03a0\u03b5\u03c1\u03b9\u03c6\u03ad\u03c1\u03b5\u03b9\u03b1 \u039a\u03b5\u03bd\u03c4\u03c1\u03b9\u03ba\u03ae\u03c2 \u039c\u03b1\u03ba\u03b5\u03b4\u03bf\u03bd\u03af\u03b1\u03c2, \u039c\u03b1\u03ba\u03b5\u03b4\u03bf\u03bd\u03af\u03b1 - \u0398\u03c1\u03ac\u03ba\u03b7, 54124, \u0395\u03bb\u03bb\u03ac\u03b4\u03b1", "40.62984145", "22.95889350", "edu", "", "Greece", "2018"], ["Simple and effective visual question answering in a single modality", "Zhejiang University, College of Computer Science, Hangzhou, P. R. China", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2016"], ["Differential Attention for Visual Question Answering", "", "IIT Kanpur", "IIT Kanpur", "Kalyanpur, Kanpur, Uttar Pradesh 208016, India", "26.51233880", "80.23290000", "edu", "", "India", "2018"], ["Grad-CAM: Visual Explanations from Deep Networks via Gradient-Based Localization", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["Neural Module Networks", "", "University of California, Berkeley", "University of California, Berkeley", "Berkeley Art Museum and Pacific Film Archive, Bancroft Way, Southside, Berkeley, Alameda County, California, 94720-1076, USA", "37.86871260", "-122.25586815", "edu", "", "United States", "2016"], ["Object-Difference Attention: A Simple Relational Attention for Visual Question Answering", "Beijing University of Posts and Telecommunications, Beijing, China", "Beijing University of Posts and Telecommunications", "Beijing University of Posts and Telecommunications", "\u5317\u4eac\u90ae\u7535\u5927\u5b66, \u897f\u571f\u57ce\u8def, \u6d77\u6dc0\u533a, \u5317\u4eac\u5e02, 100082, \u4e2d\u56fd", "39.96014880", "116.35193921", "edu", "", "China", "2018"], ["Deep Attention Neural Tensor Network for Visual Question Answering", "", "Microsoft Research Asia", "Microsoft Research Asia", "1 Memorial Dr, Cambridge, MA 02142, USA", "42.36142560", "-71.08120920", "company", "", "United States", "2018"], ["Guest Editorial: Image and Language Understanding", "Google, Seattle, USA", "Google", "Google, Inc.", "1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA", "37.42199990", "-122.08405750", "company", "Google, Mountain View, CA", "United States", "2017"], ["Bidirectional Beam Search: Forward-Backward Inference in Neural Sequence Models for Fill-in-the-Blank Image Captioning", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["SCA-CNN: Spatial and Channel-Wise Attention in Convolutional Networks for Image Captioning", "", "National University of Singapore", "National University of Singapore", "NUS, Former 1936 British Outpost, Nepal Hill, Clementi, Southwest, 117542, Singapore", "1.29620180", "103.77689944", "edu", "", "Singapore", "2017"], ["CoDraw: Visual Dialog for Collaborative Drawing", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2017"], ["Diverse and Coherent Paragraph Generation from Images", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2018"], ["An Analysis of Visual Question Answering Algorithms", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2017"], ["Simple Baseline for Visual Question Answering", "", "MIT", "Massachusetts Institute", "MIT, Amherst Street, Cambridgeport, Cambridge, Middlesex County, Massachusetts, 02238, USA", "42.35839610", "-71.09567788", "edu", "", "United States", "2015"], ["Cross-Dataset Adaptation for Visual Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Examining Cooperation in Visual Dialog Models", "", "University of Amsterdam", "University of Amsterdam", "Institute for Logic, Language and Computation (ILLC), 107, Science Park, Oost-Watergraafsmeer, Amsterdam, Oost, Amsterdam, Noord-Holland, Nederland, 1098XG, Nederland", "52.35536550", "4.95016440", "edu", "", "Netherlands", "2017"], ["Task-driven Visual Saliency and Attention-based Visual Question Answering", "", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2017"], ["TGIF-QA: Toward Spatio-Temporal Reasoning in Visual Question Answering", "", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2017"], ["Explicit Reasoning over End-to-End Neural Architectures for Visual Question Answering", "", "Arizona State University", "Arizona State University", "Arizona State University Polytechnic campus, East Texas Avenue, Mesa, Maricopa County, Arizona, 85212, USA", "33.30715065", "-111.67653157", "edu", "", "United States", "2018"], ["MovieQA: Understanding Stories in Movies through Question-Answering", "", "Karlsruhe Institute of Technology", "Karlsruhe Institute of Technology", "KIT, Leopoldshafener Allee, Linkenheim, Linkenheim-Hochstetten, Landkreis Karlsruhe, Regierungsbezirk Karlsruhe, Baden-W\u00fcrttemberg, 76351, Deutschland", "49.10184375", "8.43312560", "edu", "", "Germany", "2016"], ["I Lead, You Help but Only with Enough Details: Understanding User Experience of Co-Creation with Artificial Intelligence", "Seoul National University, Suwon-si, Gyeonggi-do, Rebublic of Korea", "Seoul National University", "Seoul National University", "\uc11c\uc6b8\ub300\ud559\uad50, \uc11c\ud638\ub3d9\ub85c, \uc11c\ub454\ub3d9, \uad8c\uc120\uad6c, \uc218\uc6d0\uc2dc, \uacbd\uae30, 16614, \ub300\ud55c\ubbfc\uad6d", "37.26728000", "126.98411510", "edu", "", "South Korea", "2018"], ["Exploration on Grounded Word Embedding: Matching Words and Images with Image-Enhanced Skip-Gram Model", "", "Peking University", "Peking University", "\u5317\u4eac\u5927\u5b66, 5\u53f7, \u9890\u548c\u56ed\u8def, \u7a3b\u9999\u56ed\u5357\u793e\u533a, \u6d77\u6dc0\u533a, \u5317\u4eac\u5e02, 100871, \u4e2d\u56fd", "39.99223790", "116.30393816", "edu", "", "China", "2018"], ["TallyQA: Answering Complex Counting Questions", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2018"], ["Uncovering the Temporal Context for Video Question Answering", "SCS, Carnegie Mellon University, Pittsburgh, USA", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["FOIL it! Find One mismatch between Image and Language caption", "", "University of Trento", "University of Trento", "University of Trento, Via Giuseppe Verdi, Piedicastello, Trento, Territorio Val d'Adige, TN, TAA, 38122, Italia", "46.06588360", "11.11598940", "edu", "", "Italy", "2017"], ["Learning to Disambiguate by Asking Discriminative Questions", "", "Robotics Institute", "Robotics Institute", "Institute for Field Robotics, \u0e1b\u0e23\u0e30\u0e0a\u0e32\u0e2d\u0e38\u0e17\u0e34\u0e28, \u0e01\u0e23\u0e38\u0e07\u0e40\u0e17\u0e1e\u0e21\u0e2b\u0e32\u0e19\u0e04\u0e23, \u0e40\u0e02\u0e15\u0e23\u0e32\u0e29\u0e0e\u0e23\u0e4c\u0e1a\u0e39\u0e23\u0e13\u0e30, \u0e01\u0e23\u0e38\u0e07\u0e40\u0e17\u0e1e\u0e21\u0e2b\u0e32\u0e19\u0e04\u0e23, 10140, \u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\u0e22", "13.65450525", "100.49423171", "edu", "", "Thailand", "2017"], ["Proposal Incorporating Structural Bias into Neural Networks", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2017"], ["Learning Social Image Embedding with Deep Multimodal Attention Networks", "", "Beijing, China", "Beijing, China", "Beijing, China", "39.90419990", "116.40739630", "edu", "", "China", "2017"], ["Video Fill In the Blank Using LR/RL LSTMs with Spatial-Temporal Attentions", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2017"], ["Towards Transparent AI Systems: Interpreting Visual Question Answering Models", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["More Than An Answer: Neural Pivot Network for Visual Qestion Answering", "Xiamen University, Xiamen, China", "Xiamen University", "Xiamen University", "\u53a6\u95e8\u5927\u5b66, \u601d\u660e\u5357\u8def Siming South Road, \u601d\u660e\u533a, \u601d\u660e\u533a (Siming), \u53a6\u95e8\u5e02 / Xiamen, \u798f\u5efa\u7701, 361005, \u4e2d\u56fd", "24.43994190", "118.09301781", "edu", "", "China", "2017"], ["R-VQA: Learning Visual Relation Facts with Semantic Attention for Visual Question Answering", "", "Tsinghua University", "Tsinghua University", "\u6e05\u534e\u5927\u5b66, 30, \u53cc\u6e05\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100084, \u4e2d\u56fd", "40.00229045", "116.32098908", "edu", "", "China", "2018"], ["Answer-Type Prediction for Visual Question Answering", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2016"], ["Presentation Attack Detection for Cadaver Iris", "", "University of Notre Dame", "University of Notre Dame", "University of Notre Dame du Lac, Holy Cross Drive, Notre Dame, Maple Lane, Saint Joseph County, Indiana, 46556, USA", "41.70456775", "-86.23822026", "edu", "", "United States", "2018"], ["Knowledge Acquisition for Visual Question Answering via Iterative Querying", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2017"], ["VizWiz Grand Challenge: Answering Visual Questions from Blind People", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2018"], ["Active Learning for Visual Question Answering: An Empirical Study", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["Interpreting Visual Question Answering Models", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["A Reinforcement Learning Framework for Natural Question Generation using Bi-discriminators", "", "Fudan University", "Fudan University", "\u590d\u65e6\u5927\u5b66, 220, \u90af\u90f8\u8def, \u4e94\u89d2\u573a\u8857\u9053, \u6768\u6d66\u533a, \u4e0a\u6d77\u5e02, 200433, \u4e2d\u56fd", "31.30104395", "121.50045497", "edu", "", "China", "2018"], ["Leveraging Video Descriptions to Learn Video Question Answering", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["Co-Attending Free-Form Regions and Detections With Multi-Modal Multiplicative Feature Embedding for Visual Question Answering", "", "East China Normal University", "East China Normal University", "\u534e\u4e1c\u5e08\u8303\u5927\u5b66, 3663, \u4e2d\u5c71\u5317\u8def, \u66f9\u5bb6\u6e21, \u666e\u9640\u533a, \u666e\u9640\u533a (Putuo), \u4e0a\u6d77\u5e02, 200062, \u4e2d\u56fd", "31.22849230", "121.40211389", "edu", "", "China", "2018"], ["Visual Question Answering with Question Representation Update (QRU)", "", "Chinese University of Hong Kong", "Chinese University of Hong Kong", "Hong Kong, \u99ac\u6599\u6c34\u6c60\u65c1\u8def", "22.41626320", "114.21093180", "edu", "", "China", "2016"], ["Neural Networks and Deep Learning", "IBM T. J. Watson Research Center, International Business Machines, Yorktown Heights, USA", "IBM Thomas J. Watson Research Center", "IBM Thomas J. Watson Research Center", "IBM Yorktown research lab, Adams Road, Millwood, Town of New Castle, Westchester County, New York, 10562, USA", "41.21002475", "-73.80407056", "company", "", "United States", "2018"], ["Stacked Attention Networks for Image Question Answering", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2016"], ["Zero-Shot Transfer VQA Dataset", "", "Baidu Research, USA", "Baidu Research, USA", "1195 Bordeaux Dr, Sunnyvale, CA 94089, USA", "37.40922650", "-122.02366150", "company", "", "United States", "2018"], ["VQS: Linking Segmentations to Questions and Answers for Supervised Attention in VQA and Question-Focused Semantic Segmentation", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2017"], ["Joint Image Captioning and Question Answering", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2018"], ["Video Fill in the Blank with Merging LSTMs", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2016"], ["Where to Look: Focus Regions for Visual Question Answering", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2016"], ["Grad-CAM: Why did you say that? Visual Explanations from Deep Networks via Gradient-based Localization", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Combining Multiple Cues for Visual Madlibs Question Answering", "University of North Carolina at Chapel Hill, Chapel Hill, USA", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill", "University of North Carolina at Chapel Hill, East Cameron Avenue, Chapel Hill, Orange County, North Carolina, 27514, USA", "35.91139710", "-79.05045290", "edu", "", "United States", "2018"], ["Visual7W: Grounded Question Answering in Images", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2016"], ["Mention Recommendation for Multimodal Microblog with Cross-attention Memory Network", "Shandong University, Jinan, China", "Shandong University", "Shandong University", "\u5c71\u4e1c\u5927\u5b66, \u6cf0\u5b89\u8857, \u9ccc\u5c71\u536b\u8857\u9053, \u5373\u58a8\u533a, \u9752\u5c9b\u5e02, \u5c71\u4e1c\u7701, 266200, \u4e2d\u56fd", "36.36934730", "120.67381800", "edu", "", "China", "2018"], ["DVQA: Understanding Data Visualizations via Question Answering", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2018"], ["Data Augmentation for Visual Question Answering", "", "Rochester Institute of Technology", "Rochester Institute of Technology", "Rochester Institute of Technology (RIT), 1, Lomb Memorial Drive, Bailey, Henrietta Town, Monroe County, New York, 14623, USA", "43.08250655", "-77.67121663", "edu", "", "United States", "2017"], ["Natural Language Video Description using Deep Recurrent Neural Networks", "", "University of Texas at Austin", "University of Texas at Austin", "University of Texas at Austin, 1, East 23rd Street, The Drag, Austin, Travis County, Texas, 78712, USA", "30.28415100", "-97.73195598", "edu", "", "United States", "2015"], ["Generative Adversarial Text to Image Synthesis", "", "Max Planck Institute for Informatics", "Max Planck Institute for Informatics", "MPII, E1 4, Campus, Universit\u00e4t, Sankt Johann, Bezirk Mitte, Saarbr\u00fccken, Regionalverband Saarbr\u00fccken, Saarland, 66123, Deutschland", "49.25795660", "7.04577417", "edu", "", "Germany", "2016"], ["Creativity: Generating Diverse Questions Using Variational Autoencoders", "", "Northwestern University", "Northwestern University", "Northwestern University, Northwestern Place, Downtown, Evanston, Cook County, Illinois, 60208, USA", "42.05511640", "-87.67581113", "edu", "", "United States", "2017"], ["Pay Attention to Those Sets! Learning Quantification from Images", "", "University of Barcelona", "University of Barcelona", "Universitat de Barcelona, Carrer de la Diputaci\u00f3, l'Antiga Esquerra de l'Eixample, Eixample, Barcelona, BCN, CAT, 08013, Espa\u00f1a", "41.38689130", "2.16352385", "edu", "", "Spain", "2017"], ["Deep Multimodal Learning: A Survey on Recent Advances and Trends", "University of Guelph, Guelph, Ontario, Canada", "University of Guelph", "University of Guelph, Guelph, Ontario, Canada", "University of Guelph, Guelph, ON N1G 1Y4, Canada", "43.52937320", "-80.22525020", "edu", "", "Canada", "2017"], ["Textually Enriched Neural Module Networks for Visual Question Answering", "", "Carnegie Mellon University", "Carnegie Mellon University Pittsburgh, PA - 15213, USA", "Carnegie Mellon University, Forbes Avenue, Squirrel Hill North, PGH, Allegheny County, Pennsylvania, 15213, USA", "40.44416190", "-79.94272826", "edu", "", "United States", "2018"], ["Being Negative but Constructively: Lessons Learnt from Creating Better Visual Question Answering Datasets", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Visual Dialog", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Explore Multi-Step Reasoning in Video Question Answering", "Tianjin University, Tianjin, China", "Tianjin University", "Tianjin University", "\u6cf0\u5c71\u822a\u7a7a\u6e2f/\u5929\u6d25\u5927\u53a6, \u67a3\u884c\u8def, \u67a3\u884c \u9ad8\u738b\u5bfa, \u957f\u57ce\u8def, \u5927\u6cb3, \u5cb1\u5cb3\u533a (Daiyue), \u6cf0\u5b89\u5e02, \u5c71\u4e1c\u7701, 271000, \u4e2d\u56fd", "36.20304395", "117.05842113", "edu", "", "China", "2018"], ["Multimodal Differential Network for Visual Question Generation", "", "Indian Institute of Technology Delhi", "IIIT-Delhi, India", "IIIT-Delhi, Mathura Road, Friends Colony, South East Delhi, Delhi, 110020, India", "28.54632595", "77.27325504", "edu", "", "India", "2018"], ["Motion-Appearance Co-Memory Networks for Video Question Answering", "", "University of Southern California", "University of Southern California", "University of Southern California, Watt Way, Saint James Park, LA, Los Angeles County, California, 90089, USA", "34.02241490", "-118.28634407", "edu", "", "United States", "2018"], ["Dual Attention Network for Visual Question Answering", "", "Boston University", "Boston University", "BU, Bay State Road, Fenway, Boston, Suffolk County, Massachusetts, 02215, USA", "42.35042530", "-71.10056114", "edu", "", "United States", "2017"], ["C-VQA: A Compositional Split of the Visual Question Answering (VQA) v1.0 Dataset", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2017"], ["Fast Parameter Adaptation for Few-shot Image Captioning and Visual Question Answering", "Zhejiang University, Zhejiang, China", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2018"], ["Solving Visual Madlibs with Multiple Cues", "", "University of Illinois, Urbana-Champaign", "University of Illinois, Urbana-Champaign", "B-3, South Mathews Avenue, Urbana, Champaign County, Illinois, 61801, USA", "40.11116745", "-88.22587665", "edu", "", "United States", "2016"], ["Automatic Generation of Grounded Visual Questions", "", "Tianjin University", "Tianjin University", "\u6cf0\u5c71\u822a\u7a7a\u6e2f/\u5929\u6d25\u5927\u53a6, \u67a3\u884c\u8def, \u67a3\u884c \u9ad8\u738b\u5bfa, \u957f\u57ce\u8def, \u5927\u6cb3, \u5cb1\u5cb3\u533a (Daiyue), \u6cf0\u5b89\u5e02, \u5c71\u4e1c\u7701, 271000, \u4e2d\u56fd", "36.20304395", "117.05842113", "edu", "", "China", "2017"], ["Visual Storytelling", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Learning Cooperative Visual Dialog Agents with Deep Reinforcement Learning", "", "Georgia Institute of Technology", "Georgia Institute of Technology", "Georgia Tech, Atlantic Drive Northwest, Bellwood, Rockdale, Atlanta, Fulton County, Georgia, 30318, USA", "33.77603300", "-84.39884086", "edu", "", "United States", "2017"], ["Unifying the Video and Question Attentions for Open-Ended Video Question Answering", "State Key Laboratory of CAD&CG, Zhejiang University, Hangzhou, China", "Zhejiang University", "Zhejiang University", "\u6d59\u6c5f\u5927\u5b66\u4e4b\u6c5f\u6821\u533a, \u4e4b\u6c5f\u8def, \u8f6c\u5858\u8857\u9053, \u897f\u6e56\u533a (Xihu), \u676d\u5dde\u5e02 Hangzhou, \u6d59\u6c5f\u7701, 310008, \u4e2d\u56fd", "30.19331415", "120.11930822", "edu", "", "China", "2017"], ["PinterNet: A thematic label curation tool for large image datasets", "", "Northwestern University", "Northwestern University", "Northwestern University, Northwestern Place, Downtown, Evanston, Cook County, Illinois, 60208, USA", "42.05511640", "-87.67581113", "edu", "", "United States", "2016"], ["Hierarchical Question-Image Co-Attention for Visual Question Answering", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Visual Question Answering Dataset for Bilingual Image Understanding: A Study of Cross-Lingual Transfer Using Attention Maps", "", "Tokyo Institute of Technology", "Tokyo Institute of Technology", "\u6771\u4eac\u5de5\u696d\u5927\u5b66, \u539a\u6728\u8857\u9053, \u7dd1\u533a, \u753a\u7530\u5e02, \u795e\u5948\u5ddd\u770c, \u95a2\u6771\u5730\u65b9, 226-0026, \u65e5\u672c", "35.51675380", "139.48342251", "edu", "", "Japan", "2018"], ["Greedy Inference Algorithms for Structured and Neural Models", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2018"], ["JUST at VQA-Med: A VGG-Seq2Seq Model", "", "Jordan University of Science and Technology", "Jordan University of Science and Technology", "Jordan University of Science and Technology, \u0634\u0627\u0631\u0639 \u0627\u0644\u0623\u0631\u062f\u0646, \u0625\u0631\u0628\u062f\u200e, \u0625\u0631\u0628\u062f, \u0627\u0644\u0623\u0631\u062f\u0646", "32.49566485", "35.99160717", "edu", "", "Jordan", "2018"], ["Two-Stage Synthesis Networks for Transfer Learning in Machine Comprehension", "", "Microsoft", "Microsoft Corporation, Redmond, WA, USA", "One Microsoft Way, Redmond, WA 98052, USA", "47.64233180", "-122.13693020", "company", "", "United States", "2017"], ["From VQA to Multimodal CQA: Adapting Visual QA Models for Community QA Tasks", "", "Waseda University", "Waseda University", "\u65e9\u7a32\u7530\u5927\u5b66 \u5317\u4e5d\u5dde\u30ad\u30e3\u30f3\u30d1\u30b9, 2-2, \u6709\u6bdb\u5f15\u91ce\u7dda, \u516b\u5e61\u897f\u533a, \u5317\u4e5d\u5dde\u5e02, \u798f\u5ca1\u770c, \u4e5d\u5dde\u5730\u65b9, 808-0135, \u65e5\u672c", "33.88987280", "130.70856205", "edu", "", "Japan", "2018"], ["A Dataset and Exploration of Models for Understanding Video Data through Fill-in-the-Blank Question-Answering", "", "Polytechnique Montreal", "Polytechnique Montr\u00b4eal", "2900 Boulevard Edouard-Montpetit, Montr\u00e9al, QC H3T 1J4, Canada", "45.50438400", "-73.61288290", "edu", "Polytechnique Montreal, Montreal, Quebec, Canada", "Canada", "2017"], ["CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning", "", "Stanford University", "Stanford University", "Stanford University, Memorial Way, Stanford, Santa Clara County, California, 94305-6015, USA", "37.43131385", "-122.16936535", "edu", "", "United States", "2017"], ["Jointly Discovering Visual Objects and Spoken Words from Raw Sensory Input", "", "MIT", "Massachusetts Institute", "MIT, Amherst Street, Cambridgeport, Cambridge, Middlesex County, Massachusetts, 02238, USA", "42.35839610", "-71.09567788", "edu", "", "United States", "2018"], ["Structured Triplet Learning with POS-Tag Guided Attention for Visual Question Answering", "", "ETH Zurich", "ETH Zurich", "R\u00e4mistrasse 101, 8092 Z\u00fcrich, Switzerland", "47.37631300", "8.54766990", "edu", "", "Switzerland", "2018"], ["Anchors: High-Precision Model-Agnostic Explanations", "", "University of Washington", "University of Washington", "University of Washington, Rainier Vista, Montlake, University District, Seattle, King County, Washington, 98195, USA", "47.65432380", "-122.30800894", "edu", "", "United States", "2018"], ["Adversarial Geometry-Aware Human Motion Prediction", "", "Carnegie Mellon University Silicon Valley", "CARNEGIE MELLON UNIVERSITY", "Carnegie Mellon University Silicon Valley, South Akron Road, ARC, Santa Clara County, California, 94035-0016, USA", "37.41021930", "-122.05965487", "edu", "", "United States", "2018"], ["End-to-End Instance Segmentation and Counting with Recurrent Attention", "", "University of Toronto", "University of Toronto", "University of Toronto, St. George Street, Bloor Street Culture Corridor, Old Toronto, Toronto, Ontario, M5S 1A5, Canada", "43.66333345", "-79.39769975", "edu", "", "Canada", "2016"], ["Hierarchical Co-Attention for Visual Question Answering", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Neural Self Talk: Image Understanding via Continuous Questioning and Answering", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2015"], ["Adversarial Learning of Answer-Related Representation for Visual Question Answering", "Beihang University, Beijing, China", "Beihang University", "Beihang University", "\u5317\u4eac\u822a\u7a7a\u822a\u5929\u5927\u5b66, 37, \u5b66\u9662\u8def, \u4e94\u9053\u53e3, \u540e\u516b\u5bb6, \u6d77\u6dc0\u533a, 100083, \u4e2d\u56fd", "39.98083330", "116.34101249", "edu", "", "China", "2018"], ["Categorizing Concepts with Basic Level for Vision-to-Language", "", "Tongji University", "Tongji University", "\u540c\u6d4e\u5927\u5b66, 1239, \u56db\u5e73\u8def, \u6c5f\u6e7e, \u8679\u53e3\u533a, \u4e0a\u6d77\u5e02, 200092, \u4e2d\u56fd", "31.28473925", "121.49694909", "edu", "", "China", ""], ["Visual Text Correction", "", "University of Central Florida", "University of Central Florida", "University of Central Florida, Libra Drive, University Park, Orange County, Florida, 32816, USA", "28.59899755", "-81.19712501", "edu", "", "United States", "2018"], ["Comparatives, Quantifiers, Proportions: A Multi-Task Model for the Learning of Quantities from Vision", "", "University of Trento", "University of Trento", "University of Trento, Via Giuseppe Verdi, Piedicastello, Trento, Territorio Val d'Adige, TN, TAA, 38122, Italia", "46.06588360", "11.11598940", "edu", "", "Italy", "2018"], ["Computer Vision and Natural Language Processing: Recent Approaches in Multimedia and Robotics", "", "University of Maryland", "University of Maryland", "The Grand Garage, 5, North Paca Street, Seton Hill, Baltimore, Maryland, 21201, USA", "39.28996850", "-76.62196103", "edu", "", "United States", "2016"], ["The Color of the Cat is Gray: 1 Million Full-Sentences Visual Question Answering (FSVQA).", "", "University of Tokyo", "University of Tokyo", "\u6771\u4eac\u5927\u5b66 \u67cf\u30ad\u30e3\u30f3\u30d1\u30b9, \u5b66\u878d\u5408\u306e\u9053, \u67cf\u5e02, \u5343\u8449\u770c, \u95a2\u6771\u5730\u65b9, 277-8583, \u65e5\u672c", "35.90204480", "139.93622009", "edu", "", "Japan", "2016"], ["Leveraging Visual Question Answering for Image-Caption Ranking", "", "Virginia Tech", "Virginia Tech", "Blacksburg, VA 24061, USA", "37.22838430", "-80.42341670", "edu", "", "United States", "2016"], ["Multi-Networks Joint Learning for Large-Scale Cross-Modal Retrieval", "University of Texas at San Antonio, San Antonio, TX, USA", "University of Texas at San Antonio", "University of Texas at San Antonio", "UTSA, Paseo Principal, San Antonio, Bexar County, Texas, 78249-1620, USA", "29.58333105", "-98.61944505", "edu", "", "United States", "2017"]]}
|