This commit is contained in:
Johannes Paehr
2025-10-18 15:35:31 +02:00
commit c4354c0441
1352 changed files with 1821051 additions and 0 deletions

BIN
better-bibtex-search.sqlite Normal file

Binary file not shown.

BIN
better-bibtex.sqlite Normal file

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1 @@
{"name":"Better BibTeX","data":[],"idIndex":null,"binaryIndices":{"itemID":{"name":"itemID","dirty":true,"values":[]},"exportNotes":{"name":"exportNotes","dirty":true,"values":[]},"biblatexAPA":{"name":"biblatexAPA","dirty":true,"values":[]},"biblatexChicago":{"name":"biblatexChicago","dirty":true,"values":[]},"useJournalAbbreviation":{"name":"useJournalAbbreviation","dirty":true,"values":[]},"bibtexURL":{"name":"bibtexURL","dirty":true,"values":[]},"DOIandURL":{"name":"DOIandURL","dirty":true,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"Better BibTeX","dirty":true,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":false,"transactional":false,"cloneObjects":true,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}

View File

@@ -0,0 +1 @@
{"name":"Better BibTeX","data":[],"idIndex":null,"binaryIndices":{"itemID":{"name":"itemID","dirty":true,"values":[]},"exportNotes":{"name":"exportNotes","dirty":true,"values":[]},"biblatexAPA":{"name":"biblatexAPA","dirty":true,"values":[]},"biblatexChicago":{"name":"biblatexChicago","dirty":true,"values":[]},"useJournalAbbreviation":{"name":"useJournalAbbreviation","dirty":true,"values":[]},"bibtexURL":{"name":"bibtexURL","dirty":true,"values":[]},"DOIandURL":{"name":"DOIandURL","dirty":true,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"Better BibTeX","dirty":true,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":false,"transactional":false,"cloneObjects":true,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}

View File

@@ -0,0 +1 @@
{"name":"Better CSL JSON","data":[],"idIndex":null,"binaryIndices":{"itemID":{"name":"itemID","dirty":true,"values":[]},"exportNotes":{"name":"exportNotes","dirty":true,"values":[]},"biblatexAPA":{"name":"biblatexAPA","dirty":true,"values":[]},"biblatexChicago":{"name":"biblatexChicago","dirty":true,"values":[]},"useJournalAbbreviation":{"name":"useJournalAbbreviation","dirty":true,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"Better CSL JSON","dirty":true,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":false,"transactional":false,"cloneObjects":true,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}

View File

@@ -0,0 +1 @@
{"name":"Better CSL JSON","data":[],"idIndex":null,"binaryIndices":{"itemID":{"name":"itemID","dirty":true,"values":[]},"exportNotes":{"name":"exportNotes","dirty":true,"values":[]},"biblatexAPA":{"name":"biblatexAPA","dirty":true,"values":[]},"biblatexChicago":{"name":"biblatexChicago","dirty":true,"values":[]},"useJournalAbbreviation":{"name":"useJournalAbbreviation","dirty":true,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"Better CSL JSON","dirty":true,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":false,"transactional":false,"cloneObjects":true,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}

View File

@@ -0,0 +1 @@
{"name":"Better CSL YAML","data":[],"idIndex":null,"binaryIndices":{"itemID":{"name":"itemID","dirty":true,"values":[]},"exportNotes":{"name":"exportNotes","dirty":true,"values":[]},"biblatexAPA":{"name":"biblatexAPA","dirty":true,"values":[]},"biblatexChicago":{"name":"biblatexChicago","dirty":true,"values":[]},"useJournalAbbreviation":{"name":"useJournalAbbreviation","dirty":true,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"Better CSL YAML","dirty":true,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":false,"transactional":false,"cloneObjects":true,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}

View File

@@ -0,0 +1 @@
{"name":"Better CSL YAML","data":[],"idIndex":null,"binaryIndices":{"itemID":{"name":"itemID","dirty":true,"values":[]},"exportNotes":{"name":"exportNotes","dirty":true,"values":[]},"biblatexAPA":{"name":"biblatexAPA","dirty":true,"values":[]},"biblatexChicago":{"name":"biblatexChicago","dirty":true,"values":[]},"useJournalAbbreviation":{"name":"useJournalAbbreviation","dirty":true,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"Better CSL YAML","dirty":true,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":false,"transactional":false,"cloneObjects":true,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}

View File

@@ -0,0 +1 @@
{"name":"BetterBibTeX JSON","data":[],"idIndex":null,"binaryIndices":{"itemID":{"name":"itemID","dirty":true,"values":[]},"exportNotes":{"name":"exportNotes","dirty":true,"values":[]},"biblatexAPA":{"name":"biblatexAPA","dirty":true,"values":[]},"biblatexChicago":{"name":"biblatexChicago","dirty":true,"values":[]},"useJournalAbbreviation":{"name":"useJournalAbbreviation","dirty":true,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"BetterBibTeX JSON","dirty":true,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":false,"transactional":false,"cloneObjects":true,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}

View File

@@ -0,0 +1 @@
{"name":"BetterBibTeX JSON","data":[],"idIndex":null,"binaryIndices":{"itemID":{"name":"itemID","dirty":true,"values":[]},"exportNotes":{"name":"exportNotes","dirty":true,"values":[]},"biblatexAPA":{"name":"biblatexAPA","dirty":true,"values":[]},"biblatexChicago":{"name":"biblatexChicago","dirty":true,"values":[]},"useJournalAbbreviation":{"name":"useJournalAbbreviation","dirty":true,"values":[]}},"constraints":null,"uniqueNames":[],"transforms":{},"objType":"BetterBibTeX JSON","dirty":true,"cachedIndex":null,"cachedBinaryIndex":null,"cachedData":null,"adaptiveBinaryIndices":false,"transactional":false,"cloneObjects":true,"cloneMethod":"parse-stringify","asyncListeners":false,"disableMeta":false,"disableChangesApi":true,"disableDeltaChangesApi":true,"autoupdate":false,"serializableIndices":true,"disableFreeze":true,"ttl":null,"maxId":0,"DynamicViews":[],"events":{"insert":[],"update":[],"pre-insert":[],"pre-update":[],"close":[],"flushbuffer":[],"error":[],"delete":[null],"warning":[null]},"changes":[],"dirtyIds":[]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1
better-bibtex/cache.json Normal file
View File

@@ -0,0 +1 @@
{"filename":"cache","collections":["itemToExportFormat","Better BibLaTeX","Better BibTeX","Better CSL JSON","Better CSL YAML","BetterBibTeX JSON"],"databaseVersion":1.5,"engineVersion":1.5,"autosave":false,"autosaveInterval":5000,"autosaveHandle":null,"throttledSaves":false,"options":{"env":"NA","serializationMethod":"normal","destructureDelimiter":"$<\n"},"persistenceAdapter":null,"throttledSavePending":false,"throttledCallbacks":[],"verbose":false,"events":{"init":[null],"loaded":[],"flushChanges":[],"close":[],"changes":[],"warning":[]},"ENV":"NA"}

View File

@@ -0,0 +1 @@
{"filename":"cache","collections":["itemToExportFormat","Better BibLaTeX","Better BibTeX","Better CSL JSON","Better CSL YAML","BetterBibTeX JSON"],"databaseVersion":1.5,"engineVersion":1.5,"autosave":false,"autosaveInterval":5000,"autosaveHandle":null,"throttledSaves":false,"options":{"env":"NA","serializationMethod":"normal","destructureDelimiter":"$<\n"},"persistenceAdapter":null,"throttledSavePending":false,"throttledCallbacks":[],"verbose":false,"events":{"init":[null],"loaded":[],"flushChanges":[],"close":[],"changes":[],"warning":[]},"ENV":"NA"}

BIN
locate/CrossRef Lookup.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

BIN
locate/Google Scholar Search.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.8 KiB

31
locate/engines.json Normal file
View File

@@ -0,0 +1,31 @@
[
{
"_name": "CrossRef Lookup",
"_alias": "CrossRef",
"_description": "CrossRef Search Engine",
"_icon": "file:///C:/Users/Johannes/Zotero/locate/CrossRef%20Lookup.ico",
"_hidden": false,
"_urlTemplate": "https://crossref.org/openurl?{z:openURL}&pid=zter:zter321",
"_urlParams": [],
"_urlNamespaces": {
"z": "http://www.zotero.org/namespaces/openSearch#",
"": "http://a9.com/-/spec/opensearch/1.1/"
},
"_iconSourceURI": "https://crossref.org/favicon.ico"
},
{
"_name": "Google Scholar Search",
"_alias": "Google Scholar",
"_description": "Google Scholar Search",
"_icon": "file:///C:/Users/Johannes/Zotero/locate/Google%20Scholar%20Search.ico",
"_hidden": false,
"_urlTemplate": "https://scholar.google.com/scholar?as_q=&as_epq={z:title}&as_occt=title&as_sauthors={rft:aufirst?}+{rft:aulast?}&as_ylo={z:year?}&as_yhi={z:year?}&as_sdt=1.&as_sdtp=on&as_sdtf=&as_sdts=22&",
"_urlParams": [],
"_urlNamespaces": {
"rft": "info:ofi/fmt:kev:mtx:journal",
"z": "http://www.zotero.org/namespaces/openSearch#",
"": "http://a9.com/-/spec/opensearch/1.1/"
},
"_iconSourceURI": "https://scholar.google.com/favicon.ico"
}
]
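The _urlTemplate values above use {namespace:field} placeholders, with a trailing ? marking a field as optional. As a rough illustration of how such a template could be filled from item metadata, here is a small Python sketch; the substitution rules and the metadata keys are assumptions for the example, not Zotero's actual locate-engine implementation.

import re
from urllib.parse import quote


def expand_template(template, metadata):
    """Expand {ns:field} / {ns:field?} placeholders with URL-encoded values.

    Fields missing from `metadata` are replaced by an empty string. This only
    approximates the behaviour of Zotero locate engines; the keys below are
    made up for the example.
    """
    def substitute(match):
        value = metadata.get(match.group(1))
        return quote(str(value)) if value is not None else ""

    # a trailing '?' inside the braces marks the field as optional
    return re.sub(r"\{([^{}?]+)\??\}", substitute, template)


item = {"z:title": "Multi-scale Mixed Reality Collaboration for Digital Twin",
        "z:year": 2021, "rft:aulast": "Kim", "rft:aufirst": "Hyung-il"}
template = ("https://scholar.google.com/scholar?as_epq={z:title}"
            "&as_sauthors={rft:aufirst?}+{rft:aulast?}&as_ylo={z:year?}")
print(expand_template(template, item))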

View File

@@ -0,0 +1,74 @@
2021 IEEE International Symposium on Mixed and Augmented Reality Adjunct (ISMAR-Adjunct)
Multi-scale Mixed Reality Collaboration for Digital Twin
Hyung-il Kim* (KAIST UVR Lab), Taehei Kim† (Motion Computing Lab, KAIST), Eunhwa Song‡ (KAIST UVR Lab), Seo Young Oh§ (KAIST UVR Lab), Dooyoung Kim¶ (KAIST UVR Lab), Woontack Woo|| (KAIST UVR Lab)
ABSTRACT
In this poster, we present a digital twin-based mixed reality system for remote collaboration with the size-scaling of the user and the space. The proposed system supports collaboration between an AR host user and a VR remote user by sharing a 3D digital twin of the AR host user. To enhance the coarse authoring of a shared digital twin environment, we provide a size scaling of the digital twin environment with the world-in-miniature view. Also, we enable scaling the size of the VR user's avatar to enhance both coarse (size-up) and fine-grained (size-down) authoring of the digital twin environment. We describe the system setup, input methods, and interaction methods for scaling space and user.
Index Terms: Human-centered computing—Human computer interaction (HCI)—Interaction paradigms—Mixed / augmented reality; Human-centered computing—Human computer interaction (HCI)—Interaction paradigms—Collaborative interaction
1 INTRODUCTION
Combining the advantages of Augmented Reality (AR) to interact with the real world and Virtual Reality (VR) to provide an immersive experience, Mixed Reality (MR) creates a more realistic virtual experience. While MR can be applied and utilized in various areas, it can be effective when assisting collaboration between two users at a distance. In this case, a local host user with an AR device summons a remote VR user into his space, and the VR user has full access to the AR user's space in virtual reality format. While collaborative systems engaging either AR or VR users have received considerable attention, collaboration in an MR environment engaging both has been studied relatively little. With the technical improvement of digital twin (DT) technology, MR collaboration can provide a more immersive collaborative experience.
We propose a novel digital twin-based MR system for remote collaboration, bridging the gap between AR and VR. Our work introduces size-scaling of two elements to enhance collaboration in a mixed reality environment: space and user. In real environments such as floor planning, interior design, and city planning, we face constraints: moving and measuring massive objects in person (in our specific scenario, e.g., furniture) and seeing the space in a larger context. Our multi-scale space approach solves this physical issue. Both the AR and VR user can observe and manipulate the AR user's space, operating virtual objects at miniature size. This reduces the burden of directly moving objects in a physical space.
*e-mail: hyungil@kaist.ac.kr †e-mail: hayleyy321@kaist.ac.kr ‡e-mail: eunhwa99@kaist.ac.kr §e-mail: seoyoung.oh@kaist.ac.kr ¶e-mail: dooyoung.kim@kaist.ac.kr ||e-mail: wwoo@kaist.ac.kr
Figure 1: Our proposed collaborative interaction between an AR host user and a VR user. a) Normal-scale collaboration b) Multi-scale space with a miniature space c) Giant user d) Miniature user
Scaling the size of the space in our study was derived from the concept of the World in Miniature (WIM), first introduced by Stoakley et al. [3], to see the whole space at one glance in VR. More recent works replicated the real world for minute manipulation of objects in AR by creating a photo-realistic composite image that allows the user to see the space from a bird's-eye view in MR [4, 5]. Building on these previous studies, our work uses the WIM concept to manipulate objects at small scale and generates a digital twin scanned from the real environment for remote collaboration in MR.
The concept of scaling the size of the user in MR collaboration has been explored in recent studies, utilizing the miniaturized avatar [2] and the giant avatar [1]. Our multi-scale user function allows the AR user to change the size of the VR user, which enables the VR user to see the space from a different viewpoint. For detailed manipulation at a small scale, the AR user changes the VR user's size into a miniature so that the VR user can author the AR object carefully. On the other hand, by enlarging the size of the VR user into a giant, the VR user can view the whole environment at a glance and visually comprehend the space, solving the difficulties of perceiving the relative locations and distances of surrounding objects in a life-scale space.
2 SYSTEM DESIGN AND IMPLEMENTATION
2.1 Overview
Our proposed system is a multi-user mixed reality system with a shared digital twin environment. A host user wears an AR headset and summons a remote VR user into his own host space. The remote VR user wears a VR headset and is summoned into the digital twin of the host space. Both the AR user and the VR user see each other as a virtual avatar in their shared space. The AR user sees the remote VR user's avatar in his own physical space, and the VR user sees the AR host user's avatar in the virtual digital twin space.
Figure 2: System diagram of the proposed digital twin system
2.2 Digital Twin Management
To enable multi-platform mixed reality collaboration using a digital twin, we designed the following digital twin system (Figure 2). Our proposed digital twin system consists of host user data, remote user data, and shared space and object data. For the AR host user data, we manage the 6DOF head pose and the 6DOF hand poses of both hands. For the remote VR user data, we manage the 6DOF head pose, 6DOF hand poses, and additional 1DOF scale data. Space data consist of the scanned space geometry, the scale of the miniature space, and the 6DOF poses of the objects. All data in the digital twin are synchronized with both collaborators in real time.
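The shared state described here could be sketched as a small data model; the following Python dataclasses use assumed names and types, since the paper does not publish its actual schema.

from dataclasses import dataclass, field
from typing import Dict, Tuple

# position (x, y, z) plus rotation (three Euler angles), i.e., 6DOF
Pose6DOF = Tuple[float, float, float, float, float, float]


@dataclass
class HostUserState:
    """AR host user: head pose and both hand poses, 6DOF each."""
    head: Pose6DOF
    left_hand: Pose6DOF
    right_hand: Pose6DOF


@dataclass
class RemoteUserState(HostUserState):
    """Remote VR user adds a 1DOF scale factor."""
    scale: float = 1.0


@dataclass
class SpaceState:
    """Shared digital twin space."""
    geometry_id: str                                 # reference to the scanned mesh
    miniature_scale: float                           # scale of the world-in-miniature
    object_poses: Dict[str, Pose6DOF] = field(default_factory=dict)


@dataclass
class DigitalTwin:
    """Everything synchronized between both collaborators in real time."""
    host: HostUserState
    remote: RemoteUserState
    space: SpaceState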
For real-time pose synchronization between the real and virtual environments, finding local coordinates relative to the real host environment is crucial. To find the relative pose in sync with the digital twin, the system needs to detect and track the host environment. Camera-based image marker detection, finding a spatial anchor, or outside-in tracking can solve this problem.
2.3 Interaction Methods
Multi-scale Space Both the AR host user and the remote VR user can summon the miniature of the digital twin by tapping a miniature summon button. After summoning the miniature digital twin, they can place the miniature or manipulate its scale in their space using their hands or controllers (Figure 1b). The miniature space object has two modes, Placement mode and Viewing mode. In Placement mode, both users can manipulate the miniature in their shared virtual space but cannot manipulate the objects inside the miniature. In Viewing mode, the pose of the miniature is fixed in the environment, and the users can manipulate the virtual objects inside the miniature. The objects in the original digital twin and the miniature are synchronized.
Multi-scale User Since the AR user cannot manipulate his own size, we enable manipulation of the VR user's size in the digital twin (Figure 1c, 1d). To manipulate the scale of the VR user, we propose two modes, Collaboration mode and User Manipulation mode. In Collaboration mode, the size of the VR user cannot be adjusted and both users can manipulate their shared virtual objects. In User Manipulation mode, both users can manipulate the size of the VR user: the AR user can manipulate the remote VR user's scale and place the remote VR user at any location, while the VR user can manipulate his own scale and navigate the digital twin space by walking or teleporting. When the position and size of the VR user are manipulated, the VR user teleports to the designated position with the specified size.
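The mode logic described for the multi-scale user could be captured roughly as in the following Python sketch; the class names, the clamping range, and the error handling are assumptions for illustration.

from enum import Enum, auto


class UserMode(Enum):
    COLLABORATION = auto()      # VR user's size is locked, shared objects can be edited
    USER_MANIPULATION = auto()  # VR user's size and placement can be changed


class VRUser:
    def __init__(self):
        self.mode = UserMode.COLLABORATION
        self.scale = 1.0
        self.position = (0.0, 0.0, 0.0)

    def set_scale(self, scale, position):
        """Resize and teleport the VR user; only allowed in User Manipulation mode."""
        if self.mode is not UserMode.USER_MANIPULATION:
            raise RuntimeError("size changes are only allowed in User Manipulation mode")
        self.scale = max(0.1, min(scale, 10.0))   # assumed clamping range
        self.position = position                  # teleport to the designated position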
Collaboration Cues We share both users' head and hand poses as collaboration cues. The shared head and hand poses are reflected as an avatar in their real/virtual space. By sharing those cues, users can tell where the collaborator is looking, which objects the collaborator is interacting with, and which gesture the collaborator is performing. Our system also shares both users' audio to enable basic verbal communication.
2.4 Setup and Implementation
For the host user (AR), we used a Microsoft HoloLens 2 as the AR headset. To scan the 3D model of the host environment, we used an Apple iPhone 12 Pro. For the remote VR side, we used an Oculus Quest 2 as the VR headset. Both devices are connected to a 5 GHz wireless network.
For the software, we used Unity 2019.4.25f1 to develop a prototype interior design application for both the AR and VR sides. To enable basic manipulation and system input in MR, we used MRTK 2.6.1. To align the virtual digital twin and the real host environment, we used Azure Spatial Anchors to find and locate the spatial anchor, then synchronized the coordinate systems of the real and virtual environments. For networking, we used Photon Unity Networking to share both users' 6DOF head poses, hand poses for both hands, and the poses of shared virtual objects.
3 CONCLUSION
In this poster, we presented a digital twin-based mixed reality remote collaboration system with multi-scale space and user. For collaborative virtual object manipulation scenarios such as collaborative interior design, we proposed the multi-scale space and multi-scale user. Using the multi-scale space, we provide a world-in-miniature of the host digital twin for quick and coarse manipulation of the shared virtual objects, and a bird's-eye view of the shared space. By manipulating the size of the remote VR user, we can enhance both coarse and fine-grained authoring of the digital twin environment.
ACKNOWLEDGMENTS
This work was supported by Korea Institute for Advancement of Technology (KIAT) grant funded by the Korea Government (MOTIE) (P0012746, The Competency Development Program for Industry Specialist).
REFERENCES
[1] T. Piumsomboon, G. A. Lee, B. Ens, B. H. Thomas, and M. Billinghurst. Superman vs giant: A study on spatial perception for a multi-scale mixed reality flying telepresence interface. IEEE Transactions on Visualization and Computer Graphics, 24(11):2974–2982, 2018. doi: 10.1109/TVCG.2018.2868594
[2] T. Piumsomboon, G. A. Lee, J. D. Hart, B. Ens, R. W. Lindeman, B. H. Thomas, and M. Billinghurst. Mini-me: An adaptive avatar for mixed reality remote collaboration. In Proceedings of the 2018 CHI Conference on Human Factors in Computing Systems, CHI '18, pp. 1–13. Association for Computing Machinery, New York, NY, USA, 2018. doi: 10.1145/3173574.3173620
[3] R. Stoakley, M. J. Conway, and R. Pausch. Virtual reality on a WIM: Interactive worlds in miniature. In Proceedings of the SIGCHI Conference on Human Factors in Computing Systems, CHI '95, pp. 265–272. ACM Press/Addison-Wesley Publishing Co., USA, 1995. doi: 10.1145/223904.223938
[4] B. Thoravi Kumaravel, F. Anderson, G. Fitzmaurice, B. Hartmann, and T. Grossman. Loki: Facilitating remote instruction of physical tasks using bi-directional mixed-reality telepresence. In Proceedings of the 32nd Annual ACM Symposium on User Interface Software and Technology, UIST '19, pp. 161–174. Association for Computing Machinery, New York, NY, USA, 2019. doi: 10.1145/3332165.3347872
[5] Z. Wang, C. Nguyen, P. Asente, and J. Dorsey. DistanciAR: Authoring Site-Specific Augmented Reality Experiences for Remote Environments. Association for Computing Machinery, New York, NY, USA, 2021.

View File

@@ -0,0 +1,14 @@
Title: Multi-scale Mixed Reality Collaboration for Digital Twin
Subject: 2021 IEEE International Symposium on Mixed and Augmented Reality Adjunct (ISMAR-Adjunct);2021; ; ;10.1109/ISMAR-Adjunct54149.2021.00098
Author: Hyung-Il Kim
Producer: OpenPDF 1.0.0-SNAPSHOT; modified using iText® 7.1.1 ©2000-2018 iText Group NV (AGPL-version)
CreationDate: 10/25/21 17:05:14
ModDate: 11/02/21 11:28:02
Tagged: no
Form: none
Pages: 2
Encrypted: no
Page size: 612 x 792 pts (letter) (rotated 0 degrees)
File size: 1183133 bytes
Optimized: no
PDF version: 1.4

View File

View File

@@ -0,0 +1,74 @@
<html lang=en-US class="vnedblzgbi idc0_343"><!--
Page saved with SingleFile
url: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=183317
saved date: Thu Nov 30 2023 15:00:14 GMT+0100 (Mitteleuropäische Normalzeit)
--><meta charset=utf-8><title class=sf-hidden>IEEE Xplore Full-Text PDF: </title><link rel=canonical href="https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&amp;arnumber=183317">
[Remainder of the saved page: base64 favicon data URI, content-security-policy meta tags, and SingleFile viewer styling omitted.]

View File

@@ -0,0 +1,725 @@
sensors
Article
ARETT: Augmented Reality Eye Tracking Toolkit for Head Mounted Displays
Sebastian Kapp 1,* , Michael Barz 2,3 , Sergey Mukhametov 1 , Daniel Sonntag 2,3 and Jochen Kuhn 1
1 Department of Physics, Technische Universität Kaiserslautern, Erwin-Schrödinger-Str. 46, 67663 Kaiserslautern, Germany; mukhamet@physik.uni-kl.de (S.M.); kuhn@physik.uni-kl.de (J.K.)
2 German Research Center for Artificial Intelligence (DFKI), Interactive Machine Learning Department, Stuhlsatzenhausweg 3, Saarland Informatics Campus D3_2, 66123 Saarbrücken, Germany; michael.barz@dfki.de (M.B.); daniel.sonntag@dfki.de (D.S.)
3 Applied Artificial Intelligence, Oldenburg University, Marie-Curie Str. 1, 26129 Oldenburg, Germany * Correspondence: kapp@physik.uni-kl.de
Citation: Kapp, S.; Barz, M.; Mukhametov, S.; Sonntag, D.; Kuhn, J. ARETT: Augmented Reality Eye Tracking Toolkit for Head Mounted Displays. Sensors 2021, 21, 2234. https://doi.org/10.3390/s21062234
Academic Editor: Jamie A Ward
Received: 25 February 2021 Accepted: 17 March 2021 Published: 23 March 2021
Publisher's Note: MDPI stays neutral with regard to jurisdictional claims in published maps and institutional affiliations.
Copyright: © 2021 by the authors. Licensee MDPI, Basel, Switzerland. This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution (CC BY) license (https:// creativecommons.org/licenses/by/ 4.0/).
Abstract: Currently an increasing number of head mounted displays (HMD) for virtual and augmented reality (VR/AR) are equipped with integrated eye trackers. Use cases of these integrated eye trackers include rendering optimization and gaze-based user interaction. In addition, visual attention in VR and AR is interesting for applied research based on eye tracking in cognitive or educational sciences for example. While some research toolkits for VR already exist, only a few target AR scenarios. In this work, we present an open-source eye tracking toolkit for reliable gaze data acquisition in AR based on Unity 3D and the Microsoft HoloLens 2, as well as an R package for seamless data analysis. Furthermore, we evaluate the spatial accuracy and precision of the integrated eye tracker for fixation targets with different distances and angles to the user (n = 21). On average, we found that gaze estimates are reported with an angular accuracy of 0.83 degrees and a precision of 0.27 degrees while the user is resting, which is on par with state-of-the-art mobile eye trackers.
Keywords: augmented reality; eye tracking; toolkit; accuracy; precision
1. Introduction
Head mounted displays (HMD) got more affordable and lightweight in the last few
years facilitating a broader usage of virtual and augmented reality (VR/AR) applications. In addition, recent devices are equipped with integrated eye trackers which primarily target novel gaze-based interaction techniques [1,2] and optimizing the display quality, e.g., using foveated rendering [3,4]. This creates new opportunities for eye tracking research in mixed reality settings. However, the number and functionality of research tools for AR and VR eye tracking is still limited, e.g., compared to the well-established stationary eye trackers that are attached to a two-dimensional display. Available commercial solutions for HMD eye tracking are mostly limited to VR (see, e.g., References [5,6]). Pupil Labs [6] offers an extension for AR eye tracking which consists of mobile eye tracking equipment attached to an HMD, but with only a loose integration into AR application development tools.
In this work, we aim at closing the gap of research tools for AR eye tracking. We implement an open-source toolkit that facilitates eye tracking research in AR environments with the Microsoft HoloLens 2. Our toolkit includes a package for the Unity 3D game development engine which enables simple integration of reliable gaze and meta data recordings in AR applications, and an R package for seamless post-hoc processing and analysis of the data. In addition, we conduct a user study (n = 21) for evaluating the spatial accuracy and precision of the gaze signal retrieved from our toolkit. We discuss our results and compare them to results for state-of-the-art mobile eye trackers from the literature.
2. Related Work
Our work is related to other research-oriented toolkits and software solutions for head-mounted eye tracking systems, particularly to those targeting VR and AR environments, and to literature on measuring the gaze estimation error.
2.1. AR and VR Eye Tracking
Some toolkits for eye tracking research in VR are available. Tobii offers a solution for eye tracking analysis in VR by providing tools for the integration of eye tracking hardware to HMDs and analysis software for eye tracking research [7]. Another commercial eye tracking add-on is offered by Pupil Labs for the HTC Vive HMD together with open-source software for data analysis [6]. Non-commercial frameworks for eye tracking in AR or VR exist, as well. Stratmann et al. [8] presented EyeMR, a low-cost system for integrating eye tracking into VR based on the Pupil Capture software and a custom Unity 3D framework. Lee et al. [9] also presented a method for low-cost gaze tracking and gaze point estimation in head-mounted devices. Mardanbegi and Pfeiffer [10] presented the EyeMRTK toolkit to develop gaze-based interaction techniques in VR and AR; however, the current implementation is limited to specific VR headsets. Adhanom et al. [11] presented the GazeMetrics tool which provides a standardized approach to measure accuracy and precision in VR settings.
The range of AR eye tracking toolkits is more limited. Pupil Labs [6] offers eye tracking add-ons for the Microsoft HoloLens 1 and the Epson Moverio BT-300, but the analysis software is tailored to mobile eye tracking without HMDs and their integration into the Unity 3D development environment is discontinued (https://github.com/pupil-labs/hmd-eyes/issues/100#issuecomment-662362737, accessed on 20 November 2020). This limits the usefulness of the offered add-ons and restricts applications to use cases in which no AR integration is required. Recent HMDs, like the Magic Leap 1 [12] and the Microsoft HoloLens 2 [13], are equipped with integrated eye trackers. However, the toolkits and APIs provided by the manufacturers are targeted at gaze-based interaction and not at eye tracking research [14,15]. Still, this enables an easy integration of visual attention into AR applications: using the spatial awareness of the devices provides eye-in-world data which otherwise has to be integrated using additional sensors [16]. We build our toolkit on top of the eye tracking APIs of the HoloLens 2 device [13]. However, all device specific code is encapsulated in a data access layer which enables easy adaption of the toolkit to other eye tracking enabled AR devices.
2.2. Measuring the Gaze Estimation Error
Eye tracking research studies investigate the impact of an intervention on the eye movements of a participant. Typically, the gaze samples or fixations, i.e., the periods for which the eye is relatively still, are used to approximate the human visual attention, and are mapped to areas of interest (AOIs) for analysis. High gaze estimation quality is essential for eye tracking research because errors can heavily undermine the results [17]. However, a key problem in head-mounted eye tracking is that the gaze estimation error, i.e., the difference between the estimated and true gaze position, can be substantial, particularly if participants move and if fixation distances vary [18,19]. Besides user position and orientation, also factors specific to the eye tracker and display, e.g., parameters of the calibration routine and of the display detection algorithm, can have significant impact on the gaze estimation error [20]. Typical metrics for the error of gaze estimation include spatial accuracy and spatial precision [21]. Spatial accuracy is commonly computed as the mean angular deviation of fixations to the actual position, and spatial precision as the root mean square error or standard deviation of individual gaze samples from their centroid [21,22].
3. Augmented Reality Eye Tracking Toolkit
We develop an eye tracking toolkit for augmented reality applications using the Unity 3D game development engine [23]. Our goal is to simplify the access to eye tracking data from the Microsoft HoloLens 2 for research purposes or advanced interaction techniques. We aim at providing raw gaze data robustly at a fixed data rate, without delay, and with highest possible spatial accuracy and precision. For this, we implement an easy-to-use interface to control recordings and enable a simple integration into applications and research studies. In addition, we implement a package for the statistical computing environment R for seamless data analysis [24]. The toolkit, a detailed documentation, and an example project are available on GitHub (https://github.com/AR-Eye-Tracking-Toolkit/ARETT, accessed on 22 March 2021) under the MIT open-source license.
3.1. Overview of HoloLens 2 Technology
We briefly summarize the underlying technology, i.e., the eye tracking hardware and software of the HoloLens 2 which we interface in our data access layer. Similar to other head-mounted eye trackers, the Microsoft HoloLens 2 uses two infrared cameras that yield a close-up view of the wearer's eyes [13]. After using the built-in 9-point calibration routine, a closed processing module provides real-time 3D gaze data to developers including a gaze origin and a direction vector. Gaze data can be accessed within Unity 3D via the Mixed Reality Toolkit (MRTK) [14] and via the underlying API for the Universal Windows Platform (UWP) [25]. The MRTK primarily focuses on enabling gaze-based interaction via an easy-to-use API for developers. It does not offer recordings for research purposes, nor does it guarantee a fixed sampling rate which is tied to the Unity 3D update rate. Hence, gaze samples might be missed. Our system is based on the API for the UWP which provides unsmoothed data, more stable data rates, and a higher level of control. Further, a high precision timestamp in the system-relative QueryPerformanceCounter (QPC) time domain with a precision of 100 ns is provided for each data point. The manufacturer is vague in reporting specifications related to data quality: the data rate is “approximately 30 Hz” with a spatial accuracy that ranges “approximately within 1.5 degrees” [26].
3.2. Architecture & Components of the Recording Tool
The recording tool of our toolkit is implemented as a package for the Unity 3D game development engine and includes four major components: the generic data provider with the HoloLens-specific data access layer that makes timestamped gaze data available in real time, the data logger that is responsible for storing the data, the web-based control interface, and a set of utility tools for data visualization. An overview of our system's architecture and the interplay of individual components is shown in Figure 1. In the following, we describe each component in detail and discuss the implementation of egocentric video capture.
Figure 1. A diagram visualizing the components of the toolkit and their interaction (data access layer, data provider, data logger, tools, and control interface).
The data provider accesses raw eye tracking data using the data access layer, processes it and raises according gaze data events. The data access layer on the HoloLens 2 checks for new gaze samples in a separate thread every 10 ms to reliably obtain all gaze samples from the API with a supposed data rate of 30 Hz, i.e., we expect a new gaze sample every 33.33 ms. This pulling is necessary as no new data event is provided by the API. Each gaze sample includes the origin of the gaze point, its direction vector, and a timestamp. All gaze samples received by the access layer are queued in the data provider and processed in the next frame update in the Unity 3D main thread. For each gaze sample, we cast a ray and check for hits with collider objects in the scene. If the option spatial mapping of the MRTK is enabled for the application, this includes the real environment that is scanned by the depth sensors of the HoloLens 2. If a collider is hit, we extend the gaze sample by the intersection coordinates in the world coordinate system, the object's name, position, rotation and scale, the intersection point in the object's local coordinate system, and the gaze point projection to the 2D eye displays. In addition, we support AOI colliders for real-time gaze-to-AOI mapping with support for dynamic AOIs. AOI collider objects can be placed at any position of a Unity 3D scene or attached to virtual objects in the scene. AOIs must be defined during the application development phase. Real-time and gaze-based adaptations can be realized using custom scripts. Synchronized recordings of the gaze signal and the front-facing camera can be used to define further AOIs post-hoc. We separately cast gaze rays to check for hits with AOI colliders. In addition, we offer an option to store the position, rotation and scaling of game objects in the scene in our gaze sample. This can be used to simulate or visualize sequences of interest post-hoc. For each processed sample, we raise an event that can be subscribed by other components, such as the data logger.
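The toolkit itself relies on Unity's physics raycasts against collider objects; as a language-agnostic illustration of the gaze-to-AOI mapping, the following Python sketch intersects a gaze ray (origin plus direction, as in each gaze sample) with an axis-aligned box standing in for an AOI collider. All names and values are invented for the example.

def ray_hits_aabb(origin, direction, box_min, box_max):
    """Return the hit point of a gaze ray with an axis-aligned AOI box, or None.

    Standard slab test; 'origin' and 'direction' come from one gaze sample,
    the box stands in for an AOI collider placed in the scene.
    """
    t_near, t_far = 0.0, float("inf")
    for o, d, lo, hi in zip(origin, direction, box_min, box_max):
        if abs(d) < 1e-9:                 # ray parallel to this slab
            if o < lo or o > hi:
                return None
            continue
        t1, t2 = (lo - o) / d, (hi - o) / d
        t_near, t_far = max(t_near, min(t1, t2)), min(t_far, max(t1, t2))
        if t_near > t_far:
            return None
    return tuple(o + t_near * d for o, d in zip(origin, direction))


# Example: gaze sample pointing straight ahead at an AOI one metre away.
print(ray_hits_aabb((0, 1.6, 0), (0, 0, 1), (-0.2, 1.4, 0.9), (0.2, 1.8, 1.1)))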
The data logger component provides the option to record all gaze samples. An overview of all recorded data columns can be found in Table 1. The files are named based on the participant's pseudonym and a custom recording name. All recordings of a participant are stored in one folder. The gaze data samples are saved as comma separated values (CSV) with one sample per row and the columns as described in Table 1. In addition, we store meta information of the recording, e.g., the start and end time of the recording, in a separate text file in the JSON format. After the recording is started, developers can log additional events in terms of an info string that is stored as part of the gaze sample and in the JSON file. This enables researchers to track custom interaction events, which are of interest to their research question, and session annotations. The recording can be started via function calls and is used in our web-based control interface.
We integrate two utility tools that ease the development, debugging, and monitoring of study prototypes. This includes a tool for visualizing a grid of fixation targets, and one for highlighting AOIs. The grid of fixation targets enables easy collection of gaze samples and corresponding target positions for the evaluation of spatial accuracy and precision. We use this tool in our evaluations: we show nine fixation targets arranged in a 3 × 3 grid at multiple distances from the user. The AOI highlighting helps in debugging dynamic and interactive experiment scenes in which AOIs can move around, appear and disappear during the experiment session. For this, the developer can add custom visualizations which can be dynamically shown and hidden using the control interface.
Our toolkit comes with a web-based control interface (see Figure 2). It enables the experimenter to easily set a participant acronym and a recording name, and to start and stop recordings from any computer in the local network. Further, it provides access to our utility tools and allows an experimenter to add custom annotations to the recording during the study.
Table 1. Overview of recorded data.

Time data
  eyeDataTimestamp: Unix timestamp of the gaze data (in ms)
  eyeDataRelativeTimestamp: Relative timestamp of the gaze data (in ms, 100 ns precision)
  frameTimestamp: Unix timestamp of the frame in which the data was processed (in ms)

Gaze data
  isCalibrationValid: Flag if the calibration of the wearer is valid
  gazeHasValue: Flag if valid gaze data exists (origin/direction)
  gazeOrigin_(x/y/z): Gaze origin in the global reference frame
  gazeDirection_(x/y/z): Gaze direction in the global reference frame
  gazePointHit: Flag if the raycast hit an object and a gaze position exists
  gazePoint_(x/y/z): Position of the gaze point in the global reference frame
  gazePoint_target_name: Name of the game object hit by the gaze ray
  gazePoint_target_(x/y/z): Position of the gaze point in the local reference frame of the hit object
  gazePoint_target_(pos/rot/scale)_(x/y/z): Position, rotation, and scale of the game object hit by the gaze ray
  gazePoint(Left/Right/Mono)Screen_(x,y,z): Position of the gaze point on the left, right and virtual mono display
  gazePointWebcam_(x,y,z): Position of the gaze point on the webcam image

AOI data
  gazePointAOIHit: Flag if the gaze ray hit an AOI
  gazePointAOI_(x/y/z): Position of the gaze point on the AOI in global coordinates
  gazePointAOI_target_name: Name of the game object representing the AOI
  gazePointAOI_target_(x/y/z): Position of the gaze point in the local reference frame of the AOI
  gazePointAOI_target_(pos/rot/scale)_(x/y/z): Position, rotation, and scale of the game object hit by the AOI ray
  gazePointAOIWebcam_(x,y,z): Position of the gaze point on the AOI on the webcam image

Additional information
  gameObject_objectName_(pos/rot/scale)_(x/y/z): Position, rotation, and scale of selected game objects
  info: Info string of a logged event
Figure 2. Screenshot of the control interface accessible over the network.
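Just for orientation, the CSV recordings described above can be inspected with ordinary data analysis tooling. The following Python snippet is an illustrative sketch: the file name, the expansion of the (x/y/z) column suffixes, and the serialization of the flag columns are assumptions, and the analysis package actually provided with ARETT is written in R.

import pandas as pd

# Hypothetical recording file; the logger names files by participant pseudonym
# and recording name, so the exact path will differ.
df = pd.read_csv("P01/setting1_d05.csv")

# Keep samples whose gaze ray actually hit an object (column from Table 1).
# Depending on how the flags were serialized they may be bools or strings.
hit = df["gazePointHit"].astype(str).str.lower() == "true"
valid = df[hit]

# World-space gaze point coordinates, assuming the (x/y/z) suffix expansion.
gaze = valid[["gazePoint_x", "gazePoint_y", "gazePoint_z"]].to_numpy()

print(len(valid), "valid samples")
print("median sample interval:", df["eyeDataTimestamp"].diff().median(), "ms")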
Typically, head-mounted eye trackers use a world camera to record the environment from an egocentric perspective and map the wearer's pupil positions to the corresponding video frames. The integrated eye tracker of the Microsoft HoloLens 2, however, maps pupil positions to gaze rays in the 3D coordinate system of the device.
Our toolkit adds a virtual camera to the 3D scene that matches the location, projection, and resolution of the integrated front-facing camera. This enables the projection of the 3D gaze position to the virtual 2D camera image and, hence, to the webcam image. The virtual camera is preconfigured to match the integrated, front-facing webcam of the HoloLens 2. We recommend checking the configuration per use case and adapting it if the camera specifications differ. The 2D gaze signal is reported via the gaze sample event of the data provider and recorded in the gazePointWebcam column.
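Conceptually, the mapping from the 3D gaze point to the webcam image is a standard pinhole projection. The sketch below illustrates the idea with NumPy; the intrinsics matrix and the world-to-camera transform are invented example values, not the HoloLens 2 camera parameters used by the toolkit.

import numpy as np


def project_to_image(point_world, world_to_cam, K):
    """Project a 3D gaze point (world coordinates) to 2D pixel coordinates.

    world_to_cam: 4x4 extrinsic matrix, K: 3x3 camera intrinsics.
    Returns None if the point lies behind the camera.
    """
    p_cam = world_to_cam @ np.append(point_world, 1.0)   # to camera coordinates
    if p_cam[2] <= 0:
        return None
    uvw = K @ p_cam[:3]
    return uvw[:2] / uvw[2]                               # pixel coordinates (u, v)


# Illustrative intrinsics roughly matching a 1280x720 image; not the real values.
K = np.array([[1000.0, 0.0, 640.0],
              [0.0, 1000.0, 360.0],
              [0.0, 0.0, 1.0]])
print(project_to_image(np.array([0.1, 0.0, 2.0]), np.eye(4), K))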
If video streaming or capturing for demonstration purposes is required only, the Mixed Reality Capture (MRC) module of HoloLens 2 can be used. It streams or records an egocentric video with an overlay showing the virtual content [27]. Our toolkit supports gaze visualization in this module by attaching a small sphere to the current gaze position that is visible in the capture but not to the user. However, this method is computationally demanding which constrains the framerate for all applications to 30 frames per second and has a negative impact on real-time interactive applications which limits its use to demonstration purposes.
3.3. R Package for Data Analysis
We implement an R package for seamless data analysis of recordings from our recording tool. Existing data analysis tools are primarily targeted at stationary eye trackers that yield a two-dimensional gaze signal or mobile eye trackers that report gaze with respect to an egocentric video feed [5,28–30]. Our toolkit reports three dimensional gaze data with a world-centered coordinate system. We provide a new R package that supports this data paradigm. It offers offline fixation detection with corresponding pre- and post-processing routines. The R package and detailed documentation are published on GitHub (https://github.com/AR-Eye-Tracking-Toolkit/ARETT-R-Package, accessed on 22 March 2021) under the MIT open-source license.
We implement two functions for pre-processing the raw gaze data, gap fill and noise reduction, similar to Reference [31]. The gap fill function linearly interpolates between valid gaze points with small gaps in between, e.g., due to loss of tracking. The noise reduction function applies a mean or median filter to the gaze data with a given window size.
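A compact Python sketch of these two pre-processing steps is shown below; it works on a single coordinate series with illustrative parameter values, whereas the actual R functions operate on the full gaze data frame.

import numpy as np


def gap_fill(values, max_gap=3):
    """Linearly interpolate runs of NaN no longer than max_gap samples."""
    values = np.asarray(values, dtype=float).copy()
    isnan = np.isnan(values)
    idx = np.arange(len(values))
    # interpolation support from the valid samples
    filled = np.interp(idx, idx[~isnan], values[~isnan])
    # only accept interpolation inside sufficiently short gaps
    gap_id = np.cumsum(~isnan)            # constant id within one NaN run
    for g in np.unique(gap_id[isnan]):
        run = isnan & (gap_id == g)
        if run.sum() <= max_gap:
            values[run] = filled[run]
    return values


def noise_reduction(values, window=3):
    """Centered median filter with the given (odd) window size."""
    v = np.asarray(values, dtype=float)
    half = window // 2
    padded = np.pad(v, half, mode="edge")
    return np.array([np.median(padded[i:i + window]) for i in range(len(v))])


print(gap_fill([1.0, np.nan, 3.0, np.nan, np.nan, np.nan, np.nan, 8.0], max_gap=2))
print(noise_reduction([1.0, 1.1, 9.0, 1.2, 1.0]))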
Three methods from the literature for offline fixation detection are implemented. This includes I-VT using a velocity threshold similar to Reference [31], I-DT for VR as described by Llanes-Jurado et al. [32] using a dispersion threshold, and I-AOI proposed by Salvucci and Goldberg [33] based on detected areas of interest. Our implementation of I-VT follows the description by Olsen [31]. It reproduces a similar behavior based on the data recorded using our toolkit. We calculate a velocity for each gaze point over a specified duration and categorize the points by comparing the velocities to a specified threshold. I-DT follows the implementation by Llanes-Jurado et al. [32]. It computes the angular dispersion distance over a window of a specific size in terms of its duration. If the initial window exceeds this threshold, it is moved forward until it no longer exceeds the threshold. Then, the window is extended to the right until the dispersion threshold is exceeded. All samples in the window, excluding the last sample, are classified as belonging to a fixation. Afterwards, a new window is initialized at the position of the last gaze sample. These steps are repeated until all samples are classified. The I-AOI method for fixation detection is based on Salvucci and Goldberg [33]. It differs from the other methods as it classifies fixations based on predefined areas of interest. First, all gaze points within an AOI are classified as belonging to a fixation. Next, groups of fixation samples are identified as a fixation event using a minimum duration threshold. Short events are discarded.
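For reference, the following Python sketch condenses the dispersion-based (I-DT) scheme just described. It uses a simple planar dispersion measure instead of the angular dispersion computed by the R package, and the threshold values are illustrative only.

def dispersion(xs, ys):
    return (max(xs) - min(xs)) + (max(ys) - min(ys))


def idt_fixations(t, x, y, max_dispersion=1.0, min_duration=100):
    """Simplified I-DT; t in ms, x/y in arbitrary units.

    Returns (start_index, end_index) pairs of detected fixations.
    """
    fixations, i, n = [], 0, len(t)
    while i < n:
        # grow an initial window that spans at least min_duration
        j = i
        while j + 1 < n and t[j] - t[i] < min_duration:
            j += 1
        if t[j] - t[i] < min_duration:
            break                                   # not enough data left
        if dispersion(x[i:j + 1], y[i:j + 1]) > max_dispersion:
            i += 1                                  # move the window forward
            continue
        # extend the window to the right while dispersion stays below threshold
        while j + 1 < n and dispersion(x[i:j + 2], y[i:j + 2]) <= max_dispersion:
            j += 1
        fixations.append((i, j))
        i = j + 1                                   # start a new window afterwards
    return fixations


# Two clusters of samples at ~30 Hz: one fixation around (0, 0), one around (5, 5).
t = [k * 33 for k in range(12)]
x = [0.0, 0.1, 0.0, 0.1, 0.0, 0.1, 5.0, 5.1, 5.0, 5.1, 5.0, 5.1]
y = [0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 5.0, 5.0, 5.1, 5.1, 5.0, 5.0]
print(idt_fixations(t, x, y))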
In addition, we provide two functions for post-processing of detected fixations: merging adjacent fixations and discarding short fixations. The merge adjacent fixations function
merges subsequent fixations if the gap is smaller than a defined maximum duration and, depending on the detection algorithm used, a maximum angle between them (I-VT) or a maximum dispersion distance (I-DT). For I-AOI, the two fixations must belong to the same AOI. The discard short fixations function removes short fixations based on a minimum fixation duration and is mainly interesting for the I-VT method because both other methods inherently contain a minimum fixation duration.
4. Evaluation of Accuracy and Precision
High eye tracking data quality is important for eye tracking research because errors in the gaze estimation process can undermine the validity of reported results [17]. However, for the integrated eye tracker of the Microsoft HoloLens 2 we only find limited information about spatial accuracy and no information about spatial precision [26]. We conduct a user study to analyze the accuracy and precision of gaze data from the HoloLens 2 that is recorded using our toolkit. We ask participants to fixate a set of targets, which have a static position with respect to the participants head, at different distances. We record the gaze signal while the participants are seated (setting I) or walking (setting II). Further, we ask them to fixate a target with a static world position while moving around (setting III). The results can serve as a reference for researchers when designing eye tracking studies, e.g., to decide whether the accuracy is sufficient, or to influence the position and size of AOIs. In addition, our results can guide interaction designers that develop gaze-based AR applications, for example to improve gaze-based selection [34].
4.1. Participants
In total, we recruited 21 participants (7 or 33% female; mean age 29.5, SD = 8.5) of which 15 participated in all three settings. Two participants skipped setting III and four participants finished setting III only. This totals to 17 participants for settings I and II (4 or 24% female; mean age 29.1, SD = 8.6), and 19 participants for setting III (7 or 37% female; mean age 30, SD = 8.8). All participants had normal or corrected-to-normal vision with one participant wearing contact lenses and three participants wearing glasses.
4.2. Conditions & Tasks
In our study, we include three settings in which we record the participants' gaze signal and the position of multiple fixation targets. In settings I and II, we show a planar 9-point grid of fixation targets (3 × 3) that is centered in front of the participant's head and orthogonal to the forward direction (Figure 3a). Participants are standing still in setting I, and walking forward and backward in setting II during the recording phase. For both settings, the grid size is aligned to the field of view of the device. The outer fixation targets are positioned at the border of the field of view such that both eyes can see them. The distances between the corner targets (upper left, upper right, lower left, lower right) and the center target are 18.25 degrees of visual angle. The distances for the edge targets (upper center, middle left, middle right, lower center) are 12.13 degrees of visual angle. In addition, we vary the distance d of the grid for both settings: we include d ∈ {0.5 m, 1 m, 2 m, 4 m}. For all distances, we ask the participants to fixate all targets for three seconds, starting on the upper left in a left-to-right and top-to-bottom direction. An example picture of settings I and II can be found in Figure 4. For setting III, we place a single fixation target at a static position in the world coordinate system: we show a sphere with a diameter of 1 cm, 15 cm above the surface of a table with a height of 75 cm (Figure 3b). Participants are seated in front of the table and are asked to move their heads left and right while keeping up the fixation on the sphere. With this setting, we simulate vestibulo-ocular reflex movements that are common in natural experiment settings in which participants interact with stationary AR content.
(a) Mixed reality photo of setting I and II
(b) Mixed reality photo of setting III
Figure 3. Mixed reality photo of our HoloLens 2 applications for all three settings which are presented to the participants. The fixation grid for settings I and II is displayed at a fixed distance from the user and resized such that the angular size is identical for all distances (a). The sphere in setting III is positioned 15 cm above the table and stays fixed on top of the visual marker when the participant moves (b). These screenshots are 2D projections which do not reflect the field-of-view and depth perception of a participant in augmented reality (AR).
Figure 4. Example of setting I and II in our study with the participant wearing a Microsoft HoloLens 2 and the supervisor controlling the recording using our toolkit.
4.3. Procedure
All settings are recorded in one session, starting with setting I and immediately followed by setting II and III. The order of the settings was identical for all participants. In the beginning of a session, the participant puts on the device which is adjusted to the head by a supervisor. The device is fitted to a participant's head such that it does not move during the experiment but is still comfortable to wear. If the participant feels that the device loosens, it is tightened by the supervisor. During the whole procedure, the device is not moved on or removed from the participant's head. After fitting, the integrated eye tracker is calibrated using the built-in calibration routine. We record gaze data and reference target positions with our new toolkit. Each task is recorded separately, resulting in a recording per distance for setting I and II, and a single recording for setting III. For settings I and II, we perform a manual fixation detection and remove gaze samples that belong to a saccade event. We performed a manual annotation of the gaze signal to extract fixations more accurately than possible with automatic algorithms which have, in particular, problems with event detection in mobile eye tracking signals [35]. Gaze samples are labeled as
belonging to a fixation unless the gaze position moved away from the fixation center, i.e., when turning into a saccade which ends at the next fixation center. The labeling is based on visual inspections from one expert. For setting III, we remove gaze samples before the participant starts fixating the sphere and moving his/her head, and after the participant stops. The participant is asked by the supervisor to start the movement and, after four minutes, asked to stop moving and to return to the starting position.
4.4. Metrics
We define spatial accuracy and precision according to the literature [34,36]. Per target, we compute spatial accuracy as the distance between the mean gaze sample and the target position. Spatial precision is computed as the standard deviation of the distances between each gaze sample and the mean position of all gaze samples. We report both measures in cm, as well as in degrees of visual angle. The distance in cm is calculated using the distance between the gaze point and the target based on their positions in the reference frame provided by Unity 3D. The visual angle is calculated as the angle between the reported 3D gaze ray from the gaze origin to the gaze point and the 3D ray from the gaze origin to the target position.
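Expressed in terms of the recorded gaze rays, the two measures can be computed roughly as in the following NumPy sketch; the array shapes, function names, and toy data are assumptions for illustration.

import numpy as np


def angular_error_deg(origins, gaze_points, target):
    """Angle (deg) between each gaze ray (origin -> gaze point) and origin -> target."""
    v_gaze = gaze_points - origins
    v_target = target - origins
    cos = np.sum(v_gaze * v_target, axis=1) / (
        np.linalg.norm(v_gaze, axis=1) * np.linalg.norm(v_target, axis=1))
    return np.degrees(np.arccos(np.clip(cos, -1.0, 1.0)))


def accuracy_and_precision(origins, gaze_points, target):
    """Accuracy: angle of the mean gaze sample to the target.
    Precision: SD of per-sample angular distances to the mean gaze sample."""
    centroid = gaze_points.mean(axis=0)
    accuracy = angular_error_deg(origins.mean(axis=0)[None, :],
                                 centroid[None, :], target)[0]
    precision = angular_error_deg(origins, gaze_points, centroid).std()
    return accuracy, precision


# Toy data: 100 gaze samples scattered around a point 4 cm off a target 2 m away.
rng = np.random.default_rng(0)
origins = np.zeros((100, 3))
gaze_points = rng.normal([0.04, 0.0, 2.0], 0.01, size=(100, 3))
print(accuracy_and_precision(origins, gaze_points, np.array([0.0, 0.0, 2.0])))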
4.5. Hypotheses
Previous research on the gaze estimation error in head-mounted eye tracking reported significant differences in spatial accuracy for varying distances and for moving around versus resting [18,20]. We expect similar characteristics for the integrated eye tracker of the Microsoft HoloLens 2. Hence, we hypothesize that the spatial accuracy depends on the distance of the fixation target (H1). Further, we expect a lower accuracy for setting II, in which participants move, than for setting I, in which they are resting (H2). Similarly, we expect that spatial precision is lower for setting II, i.e., when participants move (H3). For setting III, we exploratively investigate the spatial accuracy and precision for a realistic research setting from the educational sciences.
4.6. Results
A total of 335,867 gaze points are recorded over all participants in all three settings before filtering. Based on the relative timestamps provided by the device, the mean difference between consecutive timestamps is 33 ms (SD 1 ms). One hundred and seventy-one of these gaze points show a time difference to the previous gaze point larger than 34 ms, and 27 gaze points show a difference smaller than 32 ms. The differences larger than 34 ms are multiples of the expected 33.33 ms. All gaze points with a difference smaller than 32 ms have a difference of 0 ms. After removing the 198 gaze points with erroneous timing, we see a mean difference between timestamps of 33.33 ms (SD 2.5 × 10⁻⁴ ms).
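This timing check can be reproduced with a few lines of R; the variable name `ts_ms` for the vector of relative timestamps in milliseconds is a placeholder, not a column name defined by the toolkit.

```r
# Sketch of the timing check described above; `ts_ms` is a hypothetical numeric
# vector of relative device timestamps in milliseconds, ordered by recording time.
dt <- diff(ts_ms)
c(mean_dt   = mean(dt), sd_dt = sd(dt),
  dropped   = sum(dt > 34),   # gaps that are multiples of the expected 33.33 ms
  erroneous = sum(dt < 32))   # here these all have dt == 0
```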
For setting I, we report the metrics for all targets, which include, on average, 108.47 (SD = 43.04) gaze points after saccade removal. Table 2 shows the spatial accuracy and precision per distance, averaged over all nine fixation targets and participants. The mean angular accuracy over all distances is 0.83 degrees with a precision of 0.27 degrees. Figure 5 visualizes the error for individual targets per distance. A visualization of the analyzed gaze positions of one participant at the upper left target can be found in Figure 6. A Shapiro-Wilk test shows that the mean accuracies in degrees of visual angle over all targets are not normally distributed for all distances except 2.0 m (0.5 m: p = 0.01; 1.0 m: p = 0.03; 2.0 m: p = 0.12; 4.0 m: p = 0.03). To evaluate the difference in spatial accuracy over all distances, we conduct a Friedman test. It shows a significant difference in accuracy between the distances, χ²(3) = 20.15, p < 0.001. Post hoc analysis with Wilcoxon signed-rank tests is conducted with a Bonferroni correction, resulting in a significance level of p < 0.008. It reveals a significant difference in accuracy between the distance 0.5 m and the distances 2.0 m and 4.0 m (Table 3).
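A minimal R sketch of this analysis chain (normality check, omnibus test, Bonferroni-corrected post hoc comparisons) could look as follows; the matrix `acc` of per-participant mean accuracies, with one column per distance, is a hypothetical input and not an object produced by our toolkit.

```r
# Sketch of the statistics reported for setting I; `acc` is assumed to be a
# participants x distances matrix with the four distances as column names.
apply(acc, 2, function(x) shapiro.test(x)$p.value)   # normality per distance
friedman.test(acc)                                   # omnibus test over distances
pairs <- combn(colnames(acc), 2)                     # post hoc, Bonferroni: alpha = 0.05 / 6
apply(pairs, 2, function(p)
  wilcox.test(acc[, p[1]], acc[, p[2]], paired = TRUE)$p.value)
```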
[Figure 5: "Accuracy in degree for every calibration target while resting"; four panels for the distances 0.5 m, 1.0 m, 2.0 m, and 4.0 m; color scale: accuracy in degrees, 0–4.]
Figure 5. Plot of the mean accuracy at each distance for each target in setting I—resting. The accuracy angle for all targets is smaller than 1.5 degrees.
[Figure 6: panels "Upper left target while resting" for the four target distances (axes: x/y distance in cm; color: distance in degrees); annotated accuracy (precision, SD): 0.5 m: 0.98 cm (0.34 cm); 1.0 m: 2.25 cm (0.86 cm); 2.0 m: 3.71 cm (1.53 cm); 4.0 m: 0.66 cm (3.41 cm).]
Figure 6. Recorded gaze points of one participant in relation to the upper left target in setting I—resting. The red dot represents the mean gaze position, and each cross is one recorded gaze point.
Table 2. Accuracy and precision for setting I—resting.
Distance                 0.5 m          1.0 m          2.0 m          4.0 m
Accuracy (SD) in cm      0.91 (0.41)    1.56 (0.83)    2.85 (1.31)    5.03 (2.27)
Accuracy (SD) in deg     1.00 (0.44)    0.85 (0.46)    0.77 (0.35)    0.68 (0.31)
Precision (SD) in cm     0.40 (0.16)    0.67 (0.24)    1.35 (0.49)    3.12 (1.26)
Precision (SD) in deg    0.29 (0.13)    0.25 (0.11)    0.24 (0.10)    0.28 (0.12)
Table 3. Results of the post hoc Wilcoxon signed-rank tests for setting I—resting. * The Bonferroni-corrected significance level is p < 0.008.
Comparison        Z       p
0.5 m vs. 1.0 m   2.63    0.009
0.5 m vs. 2.0 m   3.57    <0.001 *
0.5 m vs. 4.0 m   3.43    0.001 *
1.0 m vs. 2.0 m   1.68    0.093
1.0 m vs. 4.0 m   2.06    0.039
2.0 m vs. 4.0 m   1.44    0.149
The recordings for setting II include an average of 121.23 (SD = 32.53) gaze samples per target. The mean spatial accuracy, averaged over participants and fixation targets per distance, is reported in Table 4. The mean angular accuracy over all distances is 1.77 degrees with a precision of 1.13 degrees. The results per fixation target are visualized in Figure 7.
A visualization of the analyzed gaze positions of one participant at the upper left target can be found in Figure 8. The mean accuracy in degrees of visual angle over all targets is normally distributed for the distances 0.5 m and 4.0 m, but not at 1.0 m and 2.0 m, as assessed by a Shapiro-Wilk test (0.5 m: p = 0.44; 1.0 m: p = 0.01; 2.0 m: p = 0.04; 4.0 m: p = 0.35). Analogously to setting I, we conduct a Friedman test to evaluate the difference in spatial accuracy over all distances. It shows a significant difference in accuracy between the distances, χ²(3) = 37.02, p < 0.001. The Bonferroni-corrected post hoc analysis with Wilcoxon signed-rank tests uses a significance level of p < 0.008. It reveals a significant difference in spatial accuracy for all paired comparisons except between the distances 2.0 m and 4.0 m (Table 5).
[Figure 7: "Accuracy in degree for every calibration target while walking"; four panels for the distances 0.5 m, 1.0 m, 2.0 m, and 4.0 m; color scale: accuracy in degrees, 0–4.]
Figure 7. Plot of the mean accuracy at each distance for each target in setting II—walking.
[Figure 8: panels "Upper left target while walking" for the four target distances (axes: x/y distance in cm; color: distance in degrees); annotated accuracy (precision, SD): 0.5 m: 2.72 cm (1.83 cm); 1.0 m: 2.02 cm (2.18 cm); 2.0 m: 3.75 cm (3.41 cm); 4.0 m: 5.57 cm (4.19 cm).]
Figure 8. Recorded gaze points of one participant in relation to the upper left target in setting II—walking. The red dot represents the mean gaze position, and each cross is one recorded gaze point.
Table 4. Accuracy and precision for setting II—walking.
Distance                 0.5 m          1.0 m          2.0 m          4.0 m
Accuracy (SD) in cm      2.29 (0.64)    3.35 (1.50)    5.07 (1.94)    9.75 (3.08)
Accuracy (SD) in deg     2.52 (0.69)    1.84 (0.81)    1.39 (0.53)    1.33 (0.42)
Precision (SD) in cm     1.89 (0.34)    3.33 (1.00)    6.32 (1.52)    12.58 (3.19)
Precision (SD) in deg    1.31 (0.25)    1.16 (0.47)    1.03 (0.27)    1.03 (0.32)
Table 5. Results of the post hoc Wilcoxon signed-rank tests for setting II—walking. * the Bonferroni corrected significance level is p < 0.008.
Comparison        Z        p
0.5 m vs. 1.0 m   3.432    0.001 *
0.5 m vs. 2.0 m   3.621    <0.001 *
0.5 m vs. 4.0 m   3.621    <0.001 *
1.0 m vs. 2.0 m   3.574    <0.001 *
1.0 m vs. 4.0 m   2.817    0.005 *
2.0 m vs. 4.0 m   0.686    0.492
In addition, we compare the spatial accuracy and precision results between setting I (resting) and setting II (walking). The differences in accuracy are not normally distributed for the distances 0.5 m and 1.0 m, as assessed by a Shapiro-Wilk test (0.5 m: p = 0.04; 1.0 m: p = 0.003; 2.0 m: p = 0.26; 4.0 m: p = 0.44). A Wilcoxon signed-rank test shows that the accuracy differs significantly between settings I and II for all distances (Table 6). The differences in precision are normally distributed for the distance of 0.5 m but not for the other distances, as assessed by a Shapiro-Wilk test (0.5 m: p = 0.44; 1.0 m: p < 0.001; 2.0 m: p = 0.046; 4.0 m: p = 0.003). A Wilcoxon signed-rank test shows that the precision differs significantly between settings I and II for all distances (Table 7).
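The per-distance comparison between the settings can be sketched analogously in R; `acc_rest` and `acc_walk` are hypothetical participants × distances matrices of the per-setting results, not outputs of the toolkit.

```r
# Paired per-distance comparison of setting I (resting) vs. setting II (walking).
sapply(colnames(acc_rest), function(d)
  wilcox.test(acc_rest[, d], acc_walk[, d], paired = TRUE)$p.value)
```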
Table 6. Results of the Wilcoxon signed-rank tests for the comparison of the accuracy between setting I and II.
Distance   Z      p
0.5 m      3.62   <0.001
1.0 m      3.62   <0.001
2.0 m      3.57   <0.001
4.0 m      3.53   <0.001
Table 7. Results of the Wilcoxon signed-rank tests for the comparison of the precision between setting I and II.
Distance   Z      p
0.5 m      3.62   <0.001
1.0 m      3.62   <0.001
2.0 m      3.62   <0.001
4.0 m      3.62   <0.001
For setting III, we include a mean of 641.79 (SD = 262.10) gaze samples per participant in our analysis. The resulting accuracy and precision values, together with the mean distance of the participants from the target, can be found in Table 8. We approximate the spatial accuracy in degrees of visual angle using the formula θ = tan⁻¹(O/d), with the accuracy in cm as O and the mean distance to the participant as d. The same formula is used to calculate the precision by using the precision in cm as O. A 3D visualization of the analyzed gaze positions of one participant can be found in Figure 9.
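This conversion can be transcribed directly; the two example calls below reproduce the angular values in Table 8 from the cm values and the mean distance.

```r
# theta = arctan(O / d), with O and d in the same unit (here cm).
to_deg <- function(offset_cm, distance_cm) atan(offset_cm / distance_cm) * 180 / pi
to_deg(0.34, 49.87)  # ~0.39 deg, the angular accuracy in Table 8
to_deg(0.87, 49.87)  # ~1.00 deg, the angular precision in Table 8
```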
Table 8. Accuracy, precision, and mean distance for setting III—stationary target.
Distance (SD) in cm      49.87 (13.53)
Accuracy (SD) in cm      0.34 (0.27)
Accuracy (SD) in deg     0.39 (0.31)
Precision (SD) in cm     0.87 (0.35)
Precision (SD) in deg    1.00 (0.40)
[Figure 9: 3D scatter plot "Stationary target" (axes: x/y/z distance in cm; color: distance in degrees, 0–4).]
Figure 9. Recorded gaze points of one participant in setting III—stationary target. The distance angle for all gaze points is smaller than 3 degrees.
5. Discussion
The major goal of developing the augmented reality eye tracking toolkit is to enable researchers to easily use eye tracking in AR settings with the HoloLens 2. It should allow an efficient integration into Unity 3D scenes, enable recording of a comprehensive set of eye tracking signals (see Table 1), and support a seamless analysis of the data via our R package. This would simplify the integration of eye tracking into existing AR research, such as Strzys et al. [37] and Kapp et al. [38]. Independently of the study reported in this publication, our toolkit is currently being used in two ongoing research studies, which provides first evidence in this direction. One study utilizes the Microsoft HoloLens 2 to display two-dimensional plots at a fixed distance while the participant is moving; another study investigates stationary augmentations on a table. The initial feedback from the study organizers, the developers of the AR applications, and the experimenters is positive. No major issues occurred during the recordings, which indicates high robustness, and the ease of use of the web interface was, informally, rated high.
Our toolkit can also be used to facilitate gaze-based interaction and real-time adaptive applications using the data provider module. For instance, prior research proposed to use eye tracking and HMDs to augment the episodic memory of dementia patients by storing artificial memory sequences and presenting them when needed [39]. Other works include approaches for gaze-based analysis of the user's attention engagement and cognitive states for proactive content visualization [40], and multi-focal plane interaction, such as object selection and manipulation at multiple fixation distances [41]. The toolkit can also be used in research on selection techniques in AR [42,43]. Its utility for realizing real-time adaptive applications has been shown in Reference [44]. The presented prototype uses video and gaze information from our toolkit to automatically recognize and augment attended objects in an uninstrumented environment.
5.1. Evaluation of Accuracy and Precision
The results from our evaluation show significant differences in spatial accuracy for varying distances in settings I and II. This supports our hypothesis H1. However, for setting I, the pairwise comparisons reveal that only the results for the smallest distance of 0.5 m and the distances 2.0 and 4.0 m differ significantly. For setting II, the results differ significantly for all pairs except for the two farthest distances of 2.0 m and 4.0 m. Further, our results confirm hypotheses H2 and H3: the accuracy and precision for each distance differ significantly between setting I and setting II, with setting II yielding poorer results.
Our observations also show that the spatial accuracy in degrees of visual angle improves, i.e., the angular error decreases, with increasing distance (see Tables 2 and 4). Findings from the literature suggest that accuracy degrades with increasing deviation from the calibration distance, i.e., the distance at which the fixation targets of the calibration routine are shown [18,20,45]. This leads to our assumption that the targets of the built-in calibration routine of the HoloLens 2 are placed at 2 to 4 m from the user, which is supported by the fact that Microsoft recommends an interaction distance of 2 m [46]. It is possible that this change in angular accuracy is an effect of the vergence-accommodation conflict [47], as only a combined gaze ray is made available by the device.
The official HoloLens 2 documentation reports a vague range for the spatial accuracy of "approximately within 1.5 degrees", with "slight imperfections" to be expected [26]. Our results are consistent with these specifications but considerably more fine-grained. For the resting setting (I), we observe better spatial accuracy values, ranging from 1.00 degrees of visual angle at a 0.5 m distance to 0.68 degrees at 4.0 m. For the walking setting (II), which has a lower spatial accuracy overall, the results for 0.5 m and 1.0 m are outside the official range with 2.52 and 1.84 degrees of visual angle, respectively. The two other conditions lie within the specified boundary of 1.5 degrees. The documented sampling rate of "approximately 30 Hz" is also met, with a new gaze sample being observed every 33.33 ms.
Based on our findings, we suggest minimum target sizes for eye tracking research and gaze-based interaction with the HoloLens 2. Similar to Feit et al. [34], who investigated the gaze estimation error for remote eye tracking, we calculate the minimum size such that 95% of all gaze samples hit the target. We use their formula, which computes the minimum size based on a 2-dimensional Gaussian function as S = 2(O + 2σ), with the spatial accuracy of the eye tracker as offset O and the spatial precision of the gaze signal as σ. The resulting minimum target sizes for varying distances are listed in Table 9. For a distance of 2.0 m, Microsoft recommends a target size of 5–10 cm, which conforms with our findings for setting I: we suggest a target size of 11.10 cm in this case. However, if the participant is meant to move around, the targets should be significantly larger.
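The formula can be transcribed directly; the following illustrative R lines reproduce individual values of Table 9 and the setting III recommendation from the measured accuracy and precision.

```r
# Minimum target size after Feit et al. [34]: S = 2 * (O + 2 * sigma).
min_target_size <- function(accuracy_cm, precision_cm) 2 * (accuracy_cm + 2 * precision_cm)
min_target_size(2.85, 1.35)  # 11.10 cm: setting I at 2.0 m (cf. Tables 2 and 9)
min_target_size(0.34, 0.87)  #  4.16 cm: setting III (cf. Table 8)
```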
Table 9. Recommended minimum target size in cm based on Feit et al. [34] and the identified accuracy and precision.
Distance               0.5 m       1.0 m       2.0 m       4.0 m
Setting I (Resting)    3.42 cm     5.80 cm     11.10 cm    22.54 cm
Setting II (Walking)   12.14 cm    20.02 cm    35.42 cm    69.82 cm
In setting III, we explore the characteristics of the gaze estimation error for stationary targets. The average distance to the stationary target of 49.87 cm is comparable to the 0.5 m distance in setting I. However, the mean spatial accuracy is better, while the precision is worse. The better spatial accuracy could be explained by the longer fixation durations and the varying viewing angles in setting III: on average, the mean gaze positions seem to balance around the fixation target, while the dispersion stays high (see Figure 9). Based on Feit et al. [34], we suggest a minimum target size of 4.16 cm. This is 22% larger than the recommendation for setting I, and 34% of the recommended size for setting II. Altogether,
the results suggest that the fixation duration and the user condition, i.e., walking versus not walking, influence the spatial accuracy and precision, which should be considered when designing interactive and, potentially, mobile research applications.
Finally, we compare the results of the HoloLens 2 eye tracker to available head-mounted eye trackers without an HMD. Macinnes et al. [48] evaluated the spatial accuracy and precision of three mobile eye trackers for multiple distances while participants were seated. They included (i) the Pupil Labs 120 Hz Binocular glasses with an accuracy of 0.84° and a precision of 0.16°, (ii) the SensoMotoric Instruments (SMI) Eye Tracking Glasses 2 with an accuracy of 1.21° and a precision of 0.19°, and (iii) the Tobii Pro Glasses 2 with an accuracy of 1.42° and a precision of 0.34°. On average, our results for the HoloLens 2 in setting I, which is the closest match to the setting in Reference [48], yield an accuracy of 0.83° and a precision of 0.27°. This is similar to the results of the Pupil Labs glasses, which performed best in the experiment by Macinnes et al. [48], and suggests that the eye tracking data from the HoloLens 2 can effectively be used in research experiments. However, one drawback is that the sampling rate of 30 Hz is lower compared to the devices evaluated in their experiment.
5.2. Limitations
Our toolkit enables access to raw gaze data and provides additional tools for processing them. However, it is limited to the data that are made available through the APIs of the device. For instance, the API reports a joint gaze vector for both eyes, while many commercial binocular eye tracking glasses report separate gaze rays. This forces us to intersect the gaze ray with the virtual environment to obtain a point of gaze. Separate rays could instead be intersected with each other to extract a gaze point without intersecting any surface and to infer the fixation depth. In addition, this gaze point can be used to find close-by AOIs. Our evaluation focuses on a limited set of interaction settings that probably do not generalize to all possible settings in AR environments. However, with setting III, we include a more realistic setting that more closely matches typical AR environments with a moving user and fixed visualizations. We cannot rule out effects due to the experiment order, as it was identical for all participants.
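As an illustration of what separate per-eye rays would enable, the following hypothetical R sketch triangulates a gaze point from two rays as the midpoint of their closest points; this functionality is neither offered by the HoloLens 2 API nor part of our toolkit.

```r
# Hypothetical sketch: estimating a 3D gaze point from two per-eye rays
# (origins o1/o2, directions v1/v2) as the midpoint of their closest points.
vergence_point <- function(o1, v1, o2, v2) {
  v1 <- v1 / sqrt(sum(v1^2)); v2 <- v2 / sqrt(sum(v2^2))
  w  <- o1 - o2
  b  <- sum(v1 * v2); d <- sum(v1 * w); e <- sum(v2 * w)
  denom <- 1 - b^2                      # ~0 if the rays are (near-)parallel
  t1 <- (b * e - d) / denom
  t2 <- (e - b * d) / denom
  (o1 + t1 * v1 + o2 + t2 * v2) / 2     # estimated gaze point; t1, t2 give the depth
}
```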
Currently, our toolkit is constrained to the Microsoft HoloLens 2 as the eye tracking device. However, all device-specific functionality is encapsulated in the data access layer. This makes it possible to adapt the toolkit to other eye tracking enabled AR devices in the future. However, the gaze estimation is device-specific: the results from our evaluation of spatial accuracy and spatial precision do not hold for other devices. In addition, the sampling rate might change, which needs to be addressed by re-configuring the rate at which data are pulled. The data access layer could also subscribe to gaze events or connect to a signal stream if this is supported by the new device.
6. Conclusions
In this work, we presented an open-source toolkit that enables eye tracking research in AR using the HoloLens 2 device. We addressed the gap of missing research tools by implementing a Unity 3D package for reliable gaze data acquisition and an R package for seamless data analysis. We received first positive feedback on our toolkit from two other research studies, providing first evidence of its utility. We conducted a user study (n = 21) to investigate the spatial accuracy and spatial precision of gaze data from our toolkit. The results suggest that the angular accuracy improves with increasing distance of the fixation targets. Further, we found evidence that spatial accuracy and precision drop when participants are walking compared to standing still. Overall, the gaze estimation error is similar to that of recent head-mounted eye trackers without HMDs, which shows the suitability of our toolkit for research applications. In future updates, we will address the limitations of our toolkit as follows. We plan to add fully integrated support for video recording of the integrated camera using the data logger, as well as real-time streaming of video and gaze data. We will also investigate the effectiveness of attaching virtual AOIs to real objects for real-time gaze-to-AOI mapping.
Further, we want to extend the functionality of our R package and integrate interfaces to existing gaze data processing tools, as well as integrate data access layers for other devices.
Author Contributions: Conceptualization, methodology, software, formal analysis, data curation, visualization, supervision, S.K.; investigation, S.K. and S.M.; writing—original draft preparation, S.K. and M.B.; writing—review and editing, S.K., M.B., S.M., D.S., J.K.; resources, J.K.; project administration, funding acquisition, D.S. and J.K. All authors have read and agreed to the published version of the manuscript.
Funding: This research was funded by the German Federal Ministry of Education and Research (Bundesministerium für Bildung und Forschung; BMBF) via the project “GeAR” (Grant No. 01JD1811B and 01JD1811C).
Institutional Review Board Statement: Not applicable.
Informed Consent Statement: Informed consent was obtained from all subjects involved in the study.
Data Availability Statement: The data presented in this study are available on request from the corresponding author. The data are not publicly available due to data privacy.
Conflicts of Interest: The authors declare no conflict of interest. The funders had no role in the design of the study; in the collection, analyses, or interpretation of data; in the writing of the manuscript, or in the decision to publish the results.
References
1. Majaranta, P.; Bulling, A. Eye Tracking and Eye-Based Human–Computer Interaction. In Advances in Physiological Computing; Fairclough, S.H., Gilleade, K., Eds.; Human–Computer Interaction Series; Springer: London, UK, 2014; pp. 39–65. [CrossRef]
2. Blattgerste, J.; Renner, P.; Pfeiffer, T. Advantages of eye-gaze over head-gaze-based selection in virtual and augmented reality under varying field of views. In Proceedings of the Workshop on Communication by Gaze Interaction—COGAIN '18; Morimoto, C., Pfeiffer, T., Eds.; ACM Press: New York, NY, USA, 2018; pp. 1–9. [CrossRef]
3. Guenter, B.; Finch, M.; Drucker, S.; Tan, D.; Snyder, J. Foveated 3D graphics. ACM Trans. Graph. 2012, 31, 1–10. [CrossRef]
4. Patney, A.; Salvi, M.; Kim, J.; Kaplanyan, A.; Wyman, C.; Benty, N.; Luebke, D.; Lefohn, A. Towards foveated rendering for gaze-tracked virtual reality. ACM Trans. Graph. 2016, 35, 1–12. [CrossRef]
5. Tobii Pro AB. Pro Lab User Manual. Available online: https://www.tobiipro.com/siteassets/tobii-pro/user-manuals/Tobii-Pro-Lab-User-Manual/?v=1.152 (accessed on 12 November 2020).
6. Pupil Labs. Add Awareness to Your VR/AR Experience: Integrate and React. Available online: https://pupil-labs.com/products/vr-ar/ (accessed on 20 November 2020).
7. Tobii VR. Tobii VR: Discover New Possibilities with Eye Tracking in VR. Available online: https://vr.tobii.com/ (accessed on 20 November 2020).
8. Stratmann, T.C.; Gruenefeld, U.; Boll, S. EyeMR—Low-cost Eye-Tracking for Rapid-prototyping in Head-mounted Mixed Reality. In Proceedings of the 2018 ACM Symposium on Eye Tracking Research & Applications; Sharif, B., Krejtz, K., Eds.; ACM: New York, NY, USA, 2018; pp. 1–2. [CrossRef]
9. Lee, K.F.; Chen, Y.L.; Yu, C.W.; Chin, K.Y.; Wu, C.H. Gaze Tracking and Point Estimation Using Low-Cost Head-Mounted Devices. Sensors 2020, 20, 1917. [CrossRef] [PubMed]
10. Mardanbegi, D.; Pfeiffer, T. EyeMRTK: A Toolkit for Developing Eye Gaze Interactive Applications in Virtual and Augmented Reality. In Proceedings of the 11th ACM Symposium on Eye Tracking Research & Applications; Krejtz, K., Sharif, B., Eds.; ACM: New York, NY, USA, 2019; pp. 1–5. [CrossRef]
11. Adhanom, I.B.; Lee, S.C.; Folmer, E.; MacNeilage, P. GazeMetrics: An Open-Source Tool for Measuring the Data Quality of HMD-based Eye Trackers. In ACM Symposium on Eye Tracking Research and Applications; Bulling, A., Huckauf, A., Jain, E., Radach, R., Weiskopf, D., Eds.; ACM: New York, NY, USA, 2020; pp. 1–5. [CrossRef]
12. Magic Leap. Magic Leap 1: A Thousand Breakthroughs in One. Available online: https://www.magicleap.com/en-us/magicleap-1 (accessed on 20 November 2020).
13. Microsoft. HoloLens 2: A New Reality for Computing. Available online: https://www.microsoft.com/en-us/hololens (accessed on 20 November 2020).
14. Microsoft. Eye Tracking in the Mixed Reality Toolkit. Available online: https://microsoft.github.io/MixedRealityToolkit-Unity/Documentation/EyeTracking/EyeTracking_Main.html (accessed on 17 November 2020).
15. Magic Leap. Eye Gaze. Available online: https://developer.magicleap.com/en-us/learn/guides/design-eye-gaze (accessed on 20 November 2020).
16. Hausamann, P.; Sinnott, C.; MacNeilage, P.R. Positional head-eye tracking outside the lab: An open-source solution. In ACM Symposium on Eye Tracking Research and Applications; Bulling, A., Huckauf, A., Jain, E., Radach, R., Weiskopf, D., Eds.; ACM: New York, NY, USA, 2020; pp. 1–5. [CrossRef]
17. Holmqvist, K.; Andersson, R. Eye Tracking: A Comprehensive Guide to Methods, Paradigms and Measures; Lund Eye-Tracking Research Institute: Lund, Sweden, 2011.
18. Mardanbegi, D.; Hansen, D.W. Parallax error in the monocular head-mounted eye trackers. In Proceedings of the 2012 ACM Conference on Ubiquitous Computing; ACM: New York, NY, USA, 2012; pp. 689–694. [CrossRef]
19. Barz, M.; Stauden, S.; Sonntag, D. Visual Search Target Inference in Natural Interaction Settings with Machine Learning. In Proceedings of the 2020 ACM Symposium on Eye Tracking Research & Applications; Association for Computing Machinery: New York, NY, USA, 2020; pp. 1–8. [CrossRef]
20. Barz, M.; Daiber, F.; Bulling, A. Prediction of Gaze Estimation Error for Error-Aware Gaze-Based Interfaces. In Proceedings of the Ninth Biennial ACM Symposium on Eye Tracking Research & Applications; ACM Press: New York, NY, USA, 2016; pp. 275–278. [CrossRef]
21. Holmqvist, K.; Nyström, M.; Mulvey, F. Eye tracker data quality: What it is and how to measure it. In Proceedings of the Symposium on Eye Tracking Research and Applications; ACM: New York, NY, USA, 2012; pp. 45–52. [CrossRef]
22. Barz, M.; Bulling, A.; Daiber, F. Computational Modelling and Prediction of Gaze Estimation Error for Head-Mounted Eye Trackers; Technical Report; DFKI: Kaiserslautern, Germany, 2015.
23. Unity Technologies. Unity Real-Time Development Platform|3D, 2D VR & AR Engine. Available online: https://unity.com/ (accessed on 23 February 2021).
24. The R Foundation. R: The R Project for Statistical Computing. Available online: https://www.r-project.org/ (accessed on 23 February 2021).
25. Microsoft. EyesPose Class. Available online: https://docs.microsoft.com/de-de/uwp/api/windows.perception.people.eyespose?view=winrt-19041 (accessed on 17 November 2020).
26. Microsoft. Eye Tracking on HoloLens 2. Available online: https://docs.microsoft.com/en-us/windows/mixed-reality/design/eye-tracking (accessed on 12 November 2020).
27. Microsoft. Create Mixed Reality Photos and Videos. Available online: https://docs.microsoft.com/en-us/hololens/holographicphotos-and-videos (accessed on 13 November 2020).
28. Kassner, M.; Patera, W.; Bulling, A. Pupil: An Open Source Platform for Pervasive Eye Tracking and Mobile Gaze-based Interaction. In Proceedings of the 2014 ACM International Joint Conference on Pervasive and Ubiquitous Computing Adjunct Publication—UbiComp '14 Adjunct; Brush, A.J., Friday, A., Kientz, J., Scott, J., Song, J., Eds.; ACM Press: New York, NY, USA, 2014; pp. 1151–1160. [CrossRef]
29. Dink, J.; Ferguson, B. eyetrackingR: An R Library for Eye-tracking Data Analysis. Available online: http://www.eyetracking-r.com/ (accessed on 24 November 2020).
30. Zhegallo, A.V.; Marmalyuk, P.A. ETRAN—R Extension Package for Eye Tracking Results Analysis. Perception 2015, 44, 1129–1135. [CrossRef] [PubMed]
31. Olsen, A. The Tobii I-VT Fixation Filter: Algorithm description. Available online: https://www.tobiipro.com/siteassets/tobiipro/learn-and-support/analyze/how-do-we-classify-eye-movements/tobii-pro-i-vt-fixation-filter.pdf/?v=2012 (accessed on 12 November 2020).
32. Llanes-Jurado, J.; Marín-Morales, J.; Guixeres, J.; Alcañiz, M. Development and Calibration of an Eye-Tracking Fixation Identification Algorithm for Immersive Virtual Reality. Sensors 2020, 20, 4956. [CrossRef] [PubMed]
33. Salvucci, D.D.; Goldberg, J.H. Identifying Fixations and Saccades in Eye-Tracking Protocols. In Proceedings of the Eye Tracking Research & Applications Symposium 2000, Palm Beach Gardens, FL, USA, 6–8 November 2000; Association for Computing Machinery: New York, NY, USA, 2000. [CrossRef]
34. Feit, A.M.; Williams, S.; Toledo, A.; Paradiso, A.; Kulkarni, H.; Kane, S.; Morris, M.R. Toward Everyday Gaze Input. In Proceedings of the 2017 CHI Conference on Human Factors in Computing Systems; Mark, G., Fussell, S., Lampe, C., Schraefel, M., Hourcade, J.P., Appert, C., Wigdor, D., Eds.; ACM: New York, NY, USA, 2017; pp. 1118–1130. [CrossRef]
35. Steil, J.; Huang, M.X.; Bulling, A. Fixation detection for head-mounted eye tracking based on visual similarity of gaze targets. In Eye Tracking Research and Applications Symposium (ETRA); Association for Computing Machinery: New York, NY, USA, 2018; pp. 1–9. [CrossRef]
36. Duchowski, A.; Medlin, E.; Cournia, N.; Murphy, H.; Gramopadhye, A.; Nair, S.; Vorah, J.; Melloy, B. 3-D eye movement analysis. Behav. Res. Methods Instrum. Comput. 2002, 34, 573–591. [CrossRef] [PubMed]
37. Strzys, M.P.; Kapp, S.; Thees, M.; Kuhn, J.; Lukowicz, P.; Knierim, P.; Schmidt, A. Augmenting the thermal flux experiment: A mixed reality approach with the HoloLens. Phys. Teach. 2017, 55, 376–377. [CrossRef]
38. Kapp, S.; Thees, M.; Strzys, M.P.; Beil, F.; Kuhn, J.; Amiraslanov, O.; Javaheri, H.; Lukowicz, P.; Lauer, F.; Rheinländer, C.; et al. Augmenting Kirchhoff's laws: Using augmented reality and smartglasses to enhance conceptual electrical experiments for high school students. Phys. Teach. 2019, 57, 52–53. [CrossRef]
39. Orlosky, J.; Toyama, T.; Sonntag, D.; Kiyokawa, K. Using Eye-Gaze and Visualization to Augment Memory. In Distributed, Ambient, and Pervasive Interactions; Streitz, N., Markopoulos, P., Eds.; Springer International Publishing: Cham, Switzerland, 2014; Volume 8530 LNCS, pp. 282–291. [CrossRef]
40. Toyama, T.; Sonntag, D.; Orlosky, J.; Kiyokawa, K. Attention Engagement and Cognitive State Analysis for Augmented Reality Text Display Functions. In Proceedings of the 20th International Conference on Intelligent User Interfaces—IUI '15; ACM Press: New York, NY, USA, 2015; pp. 322–332. [CrossRef]
41. Toyama, T.; Orlosky, J.; Sonntag, D.; Kiyokawa, K. A Natural Interface for Multi-Focal Plane Head Mounted Displays Using 3D Gaze. In Proceedings of the 2014 International Working Conference on Advanced Visual Interfaces; Association for Computing Machinery: New York, NY, USA, 2014; pp. 25–32. [CrossRef]
42. van der Meulen, H.; Kun, A.L.; Shaer, O. What Are We Missing? In ISS '17: Proceedings of the 2017 ACM International Conference on Interactive Surfaces and Spaces; Association for Computing Machinery: New York, NY, USA, 2017; pp. 396–400. [CrossRef]
43. Kytö, M.; Ens, B.; Piumsomboon, T.; Lee, G.A.; Billinghurst, M. Pinpointing. In Proceedings of the 2018 CHI Conference on Human Factors in Computing Systems—CHI '18; Mandryk, R., Hancock, M., Perry, M., Cox, A., Eds.; ACM Press: New York, NY, USA, 2018; pp. 1–14. [CrossRef]
44. Barz, M.; Kapp, S.; Kuhn, J.; Sonntag, D. Automatic Recognition and Augmentation of Attended Objects in Real-time using Eye Tracking and a Head-mounted Display. Manuscript submitted for publication.
45. Cerrolaza, J.J.; Villanueva, A.; Villanueva, M.; Cabeza, R. Error characterization and compensation in eye tracking systems. In Proceedings of the Symposium on Eye Tracking Research and Applications; ACM: New York, NY, USA, 2012; pp. 205–208. [CrossRef]
46. Microsoft. Comfort. Available online: https://docs.microsoft.com/de-de/windows/mixed-reality/design/comfort (accessed on 25 November 2020).
47. Kramida, G. Resolving the Vergence-Accommodation Conflict in Head-Mounted Displays. IEEE Trans. Vis. Comput. Graph. 2016, 22, 1912–1931. [CrossRef] [PubMed]
48. Macinnes, J.J.; Iqbal, S.; Pearson, J.; Johnson, E.N. Wearable Eye-tracking for Research: Automated dynamic gaze mapping and accuracy/precision comparisons across devices. bioRxiv 2018. [CrossRef]

View File

@@ -0,0 +1,16 @@
Title: ARETT: Augmented Reality Eye Tracking Toolkit for Head Mounted Displays
Subject: Currently an increasing number of head mounted displays (HMD) for virtual and augmented reality (VR/AR) are equipped with integrated eye trackers. Use cases of these integrated eye trackers include rendering optimization and gaze-based user interaction. In addition, visual attention in VR and AR is interesting for applied research based on eye tracking in cognitive or educational sciences for example. While some research toolkits for VR already exist, only a few target AR scenarios. In this work, we present an open-source eye tracking toolkit for reliable gaze data acquisition in AR based on Unity 3D and the Microsoft HoloLens 2, as well as an R package for seamless data analysis. Furthermore, we evaluate the spatial accuracy and precision of the integrated eye tracker for fixation targets with different distances and angles to the user (n=21). On average, we found that gaze estimates are reported with an angular accuracy of 0.83 degrees and a precision of 0.27 degrees while the user is resting, which is on par with state-of-the-art mobile eye trackers.
Keywords: augmented reality; eye tracking; toolkit; accuracy; precision
Author: Sebastian Kapp, Michael Barz, Sergey Mukhametov, Daniel Sonntag, Jochen Kuhn
Creator: LaTeX with hyperref
Producer: pdfTeX-1.40.21
CreationDate: 03/23/21 17:08:12
ModDate: 03/23/21 18:16:12
Tagged: no
Form: none
Pages: 18
Encrypted: no
Page size: 595.276 x 841.89 pts (A4) (rotated 0 degrees)
File size: 2237439 bytes
Optimized: no
PDF version: 1.7

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,16 @@
Title: LNCS 7700 - Neural Networks: Tricks of the Trade
Subject: Neural Networks: Tricks of the Trade
Keywords:
Author: Grégoire Montavon, Geneviève B. Orr, and Klaus-Robert Müller (eds.)
Creator: gnuplot 4.2 patchlevel 2
Producer: Acrobat Distiller 10.0.0 (Windows)
CreationDate: 11/10/12 15:07:30
ModDate: 11/14/12 16:54:53
Tagged: no
Form: AcroForm
Pages: 753
Encrypted: no
Page size: 439.363 x 666.131 pts (rotated 0 degrees)
File size: 12243176 bytes
Optimized: yes
PDF version: 1.6

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,189 @@
Inhaltsverzeichnis
1
Die Sinne - unsere Fenster zur Welt..........................
1
1.1 Wahrnehmung findet im Gehirn statt........................................................................ 2
1.1.1 Gefangen in der Maskenwelt......................................................................................... 2
1.1.2 Das Gehirn, das rätselhafte Organ der Wahrnehmung................................................ 2
1.2 Wie kommt die Welt in unseren Kopf?......................................................................... 4
1.2.1 Von der Sinneszelle zur Wahrnehmung........................................................................ 4
1.2.2 Wahrnehmung ist ein Urteilsakt des Gehirns............................................................... 5
1.3 Sinneswelten................................................................................................................. 6
1.3.1 Sinneswelt, die erste!.................................................................................................... 6
1.3.2 Sinneswelt, die zweite!................................................................................................. 7
1.3.3 Sinneswelt, die dritte!.................................................................................................. 9
1.4 Vom Sinn der Sinne....................................................................................................... 9
2 2.1 2.1.1 2.2 2.2.1 2.2.2 2.2.3 2.2.4
2.2.5 2.2.6 2.3
2.3.1 2.3.2
Die Evolution der Sinne................................................................................... 11 Die Sinne des Menschen und wie er dazu kam.......................................................... 12 Wie viele Sinne hat der Mensch?.................................................................................. 12 Die Evolution der Sinne............................................................................................... 14 Die Evolution ist der Motor für die Weiterentwicklung des Lebens.............................. 14 Das Prinzip der Zucht - die künstliche Auswahl.......................................................... 16 Das Prinzip der Evolution - die natürliche Auslese....................................................... 17 Die Eigenschaften unserer Sinnessysteme und die Verarbeitungsstrategien unseres Gehirns sind ein Produkt der Evolution.......................................................... 20 Kinder der Evolution...................................................................................................... 24 „Wer hat's erfunden?".................................................................................................... 26 Jeder auf seine Art - die Leistungen unserer Sinne sind höchst unterschiedlich............................................................................................................ 27 Zwei Sinne im Vergleich............................................................................................... 27 Vom Sinnesreiz zum Verhalten..................................................................................... 29 Weiterführende Literatur............................................................................................. 31
3 3.1 3.2 3.2.1 3.2.2 3.2.3 3.2.4 3.3
3.3.1 3.3.2
3.3.3 3.4
Die Sprache der Nervenzellen - und wie man sie versteht....................... 33 Labor eines Neurowissenschaftlers............................................................................. 35 Labor 1: Die wunderbare Welt der Nervenzelle......................................................... 37 Nervenzellen sind die Funktionseinheiten des Gehirns............................................... 37 Aufbau einer Nervenzelle............................................................................................. 37 Was macht die Nervenzelle zur Nervenzelle?............................................................... 40 Warum können Nervenzellen Signale übertragen?..................................................... 44 Labor 2: Von Ionen und Membranen - wie Nervenzellen eine elektrische Spannung aufbauen.................................................................................................... 45 Ionen sind die Grundlage für elektrische Signale inNervenzellen............................... 45 lonenpumpen bauen Unterschiede zwischen dem Inneren der Zelle und ihrer Umgebung auf.............................................................................................................. 47 lonenkanäle sind elektrische Schalter in der Zellmembran.......................................... 48 Labor 3: Aktionspotenziale sind die Sprache unseres Nervensystems.................... 51
3.4.1 3.4.2 3.5 3.5.1 3.6 3.6.1
3.6.2 3.6.3
Die Membranspannung spiegelt die Aktivität einer Nervenzelle wider........................ 51
Aktionspotenziale leiten Signale über lange Strecken................................................. 52
Labor 4: Wie Nervenzellen Information austauschen................................................. 58
Synapsen übertragen die Information chemisch......................................................... 58
Labor 5: Wie man mit Nervenzellen einen Hochleistungsrechner baut.................... 61
Die Grundlagen des neuronalen Rechnens: Konvergenz und Divergenz,
Erregung und Hemmung.............................................................................................. 61
Der Rechner in der Nervenzelle....................................................................
64
Die schreckhafte Maus oder die Rückwärtshemmung alsNotbremse.......................... 66
Weiterführende Literatur.............................................................................................. 67
4 4.1 4.1.1 4.1.2 4.1.3
4.1.4
4.2 4.2.1 4.3 4.3.1
4.4 4.4.1 4.4.2 4.4.3
Von der Sinneszelle zum Gehirn..................................................................... 69 Vom Reiz zum elektrischen Signal - die Signalwandlung.......................................... 70 Eine komplizierte Aufgabe............................................................................................ 70 Sinneszellen besitzen ein spezialisiertes Außensegment............................................. 70 Die einfachste Art der Signalwandlung: Rezeptor und lonenkanal sind in einem Protein zusammengefasst.................................................................................. 71 Signalwandlung mit dem Baukastensystem - die G-Protein-gekoppelte Signalkaskade................................................................................................................. 72 Adaptation.................................................................................................................... 76 Sinneszellen passen sich an die Umgebung an - sie adaptieren.................................. 76 Codierung der Sinnesinformation............................................................................... 77 Sinnesreize werden in der Abfolge von Aktionspotenzialen codiert und an das Gehirn geschickt..................................................................................................... 77 Die geordnete Verschaltung der Sinnesinformation................................................. 78 Ordnung im Strom der Sinnesinformation................................................................... 78 Ordnung auf höchster Ebene - die topografische Abbildung..................................... 81 Die Sinnesinformation wird gefiltert............................................................................. 81 Weiterführende Literatur.............................................................................................. 82
5
Schmecken......................................................................................................... 83
5.1 Vom Sinn des Schmeckens........................................................................................... 84
5.2 Geschmackszellen überprüfen die Nahrung.............................................................. 86
5.3 Sauer und salzig: lonenkanäle auf der Zunge............................................................ 86
5.4 Bittere Gifte.................................................................................................................. 91
5.5 Köstlicher Geschmack: Süß und umami....................................................................... 94
5.6 Der „Scharfgeschmack" ist eigentlich ein Schmerzreiz............................................. 96
5.7 Die Geschmacksempfindung....................................................................................... 98
5.8 Andere Lösungen.......................................................................................................... 99
Weiterführende Literatur................................................................................................100
6
Riechen................................................................................................................. 101
6.1 Die Vielfalt der Gerüche ist grenzenlos........................................................................ 102
6.2 Riechzellen in der Nase delektieren Duftstoffe........................................................... 103
6.3 Im Gehirn entstehen Geruchsbilder..............................................................................110
6.4 Bleib jung! Das Riechsystem erneuert sich selbst........................................................112
6.5 Das Riechen mit Zilien................................................................................................... 112
6.6 Pheromone organisieren das Sozialleben...................................................................114 6.7 Was uns an Gerüchen interessiert.................................................................................119 6.8 Leben, ohne zu riechen................................................................................................. 123
Weiterführende Literatur................................................................................................123
7 Sehen.....................................................................................................................125 7.1 Augen auf - und dann?.................................................................................................128 7.1.1 Ball, Satz und Sieg!......................................................................................................... 128 7.1.2 Betrachten wir die Sache mit dem Sehen mal bei Licht.................................................128 7.1.3 Was wir in diesem Kapitel sehen werden........................................................................130 7.1.4 Was ist eigentlich Licht?................................................................................................. 131 7.2 Das Auge..........................................................................................................................132 7.2.1 „Ich seh dir in die Augen, Kleines!"..................................................................................132 7.2.2 Auf den ersten Blick ähnelt unser Auge einer Kamera....................................................134 7.2.3 Nur im winzigen Zentrum unseres Bildfeldes sehen wir wirklich scharf........................137 7.2.4 Die Verteilung der Photorezeptoren erfolgt als Anpassung an die Lebensweise........ 142 7.2.5 Wer hat die schärfsten Augen?.......................................................................................143 7.3 Wie unsere Photorezeptoren Licht in die Sprache des Nervensystems
übersetzen - die Phototransduktion............................................................................146 7.3.1 Das Außensegment ist die lichtempfindliche Antenne des Photorezeptors...............146 7.3.2 Der erste Schritt beim Sehen: Ein Farbstoffmolekül im Photorezeptor absorbiert
das Lichtquant................................................................................................................148 7.3.3 Die elektrische Lichtantwort unserer Photorezeptoren ist außergewöhnlich............. 149 7.3.4 Unsere Photorezeptoren - die etwas anderen Zellen....................................................151 7.3.5 Ein Stäbchen kann zwar auf ein Lichtquant reagieren, wahrnehmen können
wir ein einzelnes Lichtquant aber nicht..........................................................................154 7.3.6 Besser als jeder fotografische Film: Die Anpassungsleistung der Netzhaut...................154 7.3.7 Immer in Bewegung bleiben - wie Mikrosakkaden unsere Wahrnehmung
stabilisieren.....................................................................................................................157 7.4 Farbensehen...................................................................................................................158 7.4.1 Drei Sehpigmente in den Zapfen ermöglichen uns das Farbensehen........................... 158 7.4.2 Die trichromatische Theorie der Farbwahrnehmung......................................................162 7.4.3 Farbsehstörungen...........................................................................................................162 7.4.4 Die Evolution des Farbensehens......................................................................................163 7.5 Die Retina - der Rechner im Auge................................................................................. 166 7.5.1 Die Netzhaut besteht nicht nur aus Photorezeptoren.................................................... 166 7.5.2 Die Information wird im retinalen Netzwerk weiterverarbeitet..................................... 167 7.5.3 Die Sprache der Ganglienzellen......................................................................................169 7.5.4 Vorteil eins: Objekttrennung durch Kontrastverschärfung!............................................171 7.5.5 Vorteil zwei: Die Informationsflut wird reduziert............................................................ 173 7.5.6 Vorteil drei: Unabhängig werden von der Beleuchtung.................................................174 7.5.7 Wie die Antwort im Zentrum des rezeptiven Feldes erzeugt wird................................ 178 7.5.8 Wie die Retina durch laterale Hemmung rezeptive Felder erzeugt.............................. 178 7.5.9 Ganglienzellen sind neuronale Filter...............................................................................180 7.5.10 Auf ins Gehirn!.................................................................................................................184 7.6 Eine Reise durch das Sehsystem.....................................................................................184 7.6.1 Von der Retina bis zur primären Sehrinde.......................................................................184 7.6.2 Die Sehrinde ist hochorganisiert....................................................................................187
7.6.3
7.6.4 7.6.5 7.6.6 7.6.7 7.6.8
Die meisten rezeptiven Felder in der primären Sehrinde reagieren auf Kanten und Linien........................................................................................................................188 Jenseits der primären Sehrinde......................................................................................192 Der dorsale Pfad: Die Wo-wie-wohin-Bahn.....................................................................193 Der ventrale Pfad: die Was-Bahn......................................................................................194 Wo, bitte, geht's zur Großmutterzelle?............................................................................196 Andere Lösungen: Komplexaugen...................................................................................198 Weiterführende Literatur................................................................................................201
8 8.1 8.2 8.2.1 8.2.2 8.3 8.3.1 8.3.2 8.3.3 8.3.4 8.3.5 8.3.6 8.4 8.4.1 8.4.2 8.4.3 8.5 8.5.1 8.5.2 8.5.3 8.6
Hören.................................................................................................................... 203 Bei Nacht im Kreidewald................................................................................................204 Schall hören....................................................................................................................205 Von der Schallquelle in das Ohr......................................................................................205 Die Vielfalt des Hörens: Töne, Klänge, Geräusche...........................................................207 Cochlea - die tonotope Hörschnecke............................................................................ 208 Resonanz und Wanderwellen.........................................................................................208 Aufbau der Cochlea........................................................................................................209 Der Verstärker des Corti-Organs......................................................................................211 Innere Haarzellen - empfindlicher geht es nicht.............................................................213 Die mechanoelektrische Transduktion............................................................................217 Haarzellen übertragen ihr Signal auf Nervenfasern........................................................219 Unsere Hörwelt...............................................................................................................221 Schallortung....................................................................................................................221 Die Wahrnehmung von Sprache......................................................................................225 Musik - der direkte Weg zur Emotion............................................................................. 229 Die Hörwelt der anderen: Echoortung..........................................................................232 „Sehen mit den Ohren"...................................................................................................232 Die Kunst der Echoortung..............................................................................................234 Angewandte Physik - die Fledermaus nutzt den Dopplereffekt................................... 238 Andere Lösungen: Mit den Knochen hören..................................................................240 Weiterführende Literatur................................................................................................243
9 9.1 9.2 9.3 9.3.1 9.3.2 9.4 9.4.1 9.4.2
Orientierung und Navigation............................................................................245 Wo bin ich?......................................................................................................................246 Die Orientierung an chemischen Signalen................................................................... 247 Visuelle Orientierung.....................................................................................................250 Sonne und Polarstern dienen als Orientierungshilfe......................................................250 Die Detektion von polarisiertem Licht............................................................................252 Der magnetische Kartensinn.........................................................................................255 Das Magnetfeld der Erde................................................................................................. 255 Magnetsinn bei Vögeln...................................................................................................259 Weiterführende Literatur................................................................................................266
10 Tasten und Fühlen........................................................................................................267 10.1 Unsere Haut....................................................................................................................268 10.2 Tasthaare.........................................................................................................................271 10.3 Schmerz - Warnung und Leid........................................................................................273
10.4 Kälte, Wärme, Infrarot....................................................................................................283 Weiterführende Literatur...............................................................................................287
11
Unsere Innenwelt...............................................................................................289
11.1 Regelkreise organisieren den Körper...........................................................................290
11.2 Muskelspindeln..............................................................................................................292
11.3 Der Gleichgewichtssinn................................................................................................ 293
11.4 Ausleuchtung der Innenwelt: Die Endorezeptoren.....................................................296
Weiterführende Literatur...............................................................................................299
12 Wahrnehmung....................................................................................................301 12.1 Was ist Wahrnehmung?.................................................................................................303 12.1.1 Der erste Schritt: Wahrnehmung ist indirekt - unser Gehirn muss die Umwelt
deshalb rekonstruieren...................................................................................................304 12.1.2 Der zweite Schritt zur Wahrnehmung: Die Rekonstruktion unserer Umwelt
erfolgt nicht „wertfrei" - unser Gehirn stellt eine Hypothese über die Umwelt auf.... 305 12.2 Prinzipien der Objekterkennung................................................................................. 308 12.2.1 Das Gehirn nutzt zur Wahrnehmung von Objekten einfache Prinzipien....................... 308 12.3 Trennung von Objekt und Hintergrund........................................................................313 12.3.1 Unser Gehirn „übertreibt" beim Trennen von Objekt und Hintergrund..........................313 12.3.2 Wettstreit der Strategien.................................................................................................314 12.3.3 Scheinkonturen - wir sehen etwas, das gar nicht ist......................................................315 12.4 Wahrnehmung von Bewegung.................................................................................... 316 12.4.1 Bewegung ist einer der wichtigsten Parameter in einer belebten Umwelt.................... 316 12.4.2 Wer bewegt sich - du oder ich?.....................................................................................317 12.5 Wahrnehmung von Tiefe............................................................................................... 320 12.5.1 Wie erzeugt unser Gehirn eine dreidimensionale Wahrnehmung aus einem
zweidimensionalen Retinabild?..................................................................................... 320 12.5.2 Auch ein zweidimensionales Bild kann Tiefeninformation enthalten............................ 320 12.5.3 Erst das Sehen mit zwei Augen erlaubt die optimale Tiefenwahrnehmung.................. 321 12.5.4 Die Wunderwelt des Stereogramms.............................................................................. 323 12.5.5 Zufallspunktbilder - Tiefe aus dem Rauschen...............................................................325 12.5.6 Das Pulfrich-Pendel - oder: Täuschung ist die Wahrnehmung einer falschen
Hypothese.......................................................................................................................326 12.6 Wahrnehmung von Größe............................................................................................329 12.6.1 Das Prinzip der Größenkonstanz - damit aus Riesen keine Zwerge werden.............. 329 12.6.2 Wenn Kugeln wachsen und schrumpfen - Größenkonstanz beim Pulfrich-Pendel.... 332 12.7 Wettstreit der Sinne, Körpertausch, Magie und andere Illusionen............................333 12.7.1 Das Gehirn sucht aktiv nach Information........................................................................333 12.7.2 Wahrnehmung ist ein Erinnerungsprozess.................................................................... 334 12.7.3 Zur lückenlosen, geordneten Wahrnehmung muss das Gehirn unser
Zeitempfinden bei der Wahrnehmung manipulieren....................................................335 12.7.4 Unser Gedächtnis ist die tragende Säule unserer Wahrnehmung................................. 336 12.7.5 „Blinde hören besser als Sehende" - Mythos oder Wirklichkeit?................................... 339 12.7.6 Ist die Wahrnehmung des eigenen Körpers auch nur ein Konstrukt
unseres Gehirns?.............................................................................................................340 12.7.7 Wahrnehmung ist abhängig von unserer Aufmerksamkeit........................................... 341
12.7.8 Selektive Aufmerksamkeit führt zur Blindheit für andere Reize...................................343 12.7.9 Aufmerksamkeit verändert die Physiologie des Gehirns.............................................344 12.7.10 Wahrnehmungsexperten der besonderen Art.............................................................. 346 12.7.11 Im Gleichschritt zur Wahrnehmung............................................................................... 347 12.7.12 Was wir von Patienten mit Wahrnehmungsstörungen lernen können..........................349
Weiterführende Literatur.............................................................................................. 351
13
Anhang...............................................................................................................353
13.1 Herstellung von Masken..............................................................................................354
13.2 Die versteckte Maus....................................................................................................354
13.3 Die Täuschung nach Koffka......................................................................................... 355
13.4 Suchbilder......................................................................................................................355
13.5 Gedankenlesen aus der Ferne......................................................................................355
Serviceteil Glossar............................................................................................................................362 Stichwortverzeichnis......................................................................................................371

View File

@@ -0,0 +1,11 @@
Producer: ABBYY FineReader Server; modified using iTextSharpTM 5.5.13 ©2000-2018 iText Group NV (AGPL-version)
CreationDate: 07/23/19 14:29:53
ModDate: 07/23/19 16:36:07
Tagged: no
Form: none
Pages: 6
Encrypted: no
Page size: 453.5 x 680.3 pts (rotated 0 degrees)
File size: 396913 bytes
Optimized: no
PDF version: 1.4

View File

@@ -0,0 +1,8 @@
Institute of Mathematical Statistics is collaborating with JSTOR to digitize, preserve, and extend access to
The Annals of Mathematical Statistics.
®
www.jstor.org

View File

@@ -0,0 +1,11 @@
Creator: page2pdf
Producer: PDFlib 3.02 (SunOS 5.6)
CreationDate: 01/20/06 13:52:38
Tagged: no
Form: none
Pages: 11
Encrypted: no
Page size: 483.8 x 800.9 pts (rotated 0 degrees)
File size: 779952 bytes
Optimized: no
PDF version: 1.3

View File

@@ -0,0 +1,323 @@
International Journal of IT, Engineering and Applied Sciences Research (IJIEASR) ISSN: 2319-4413
Volume 2, No. 3, March 2013
Performance Comparison: Optical and Magnetic Head Tracking
Harjot Singh, Student, M.E (Electronics & Communication Engineering), University Institute of Engineering & Technology (UIET), Panjab University, Chandigarh, India Vinod Karar, CSIR-Central Scientific Instruments Organisation, Chandigarh Naresh Kumar, Assistant Professor (ECE), University Institute of Engineering & Technology (UIET), Panjab University, Chandigarh, India Surender Singh Saini, CSIR-Central Scientific Instruments Organisation, Chandigarh
ABSTRACT
Tracking, also called position and orientation tracking, is used in virtual environments (VEs) where the orientation and position of a real physical object are required. Specifying a point in 3-D requires the translational position, that is, the Cartesian coordinates x, y, and z. However, many VE applications manipulate entire objects, and this also requires the orientation to be specified by three angles known as pitch (elevation), roll, and yaw (azimuth). Thus, six degrees of freedom (DOF) are the minimum required to fully describe the position and orientation of an object in 3-D. Head tracking measures the movements of the head and updates the display accordingly. Head trackers provide the flight computer with information about the orientation of the pilot's head with a high degree of accuracy and an extremely low impact on helmet-mounted display (HMD) weight, size, and packaging. This paper compares the performance of head tracking using optical and magnetic tracking techniques.
Keywords:
Head tracker, field of view (FOV), Optical tracker, Electro-magnetic tracker, helmet mounted display, smoother
METHODS AND MATERIALS
The head-tracking process of determining the user's head position, relaying this position to the sensor, moving the sensor to the correct line of sight, acquiring the scene with the sensor, and transmitting and presenting the final imagery on the HMD takes time [2].
One potential problem is that the time required to measure head movements with the head tracking system and to update the display creates a temporal lag that impairs perception and places constraints on the gain of the head tracking system. It is therefore important to investigate the relationships among head-tracking accuracy, lag, and size of head movement to determine an acceptable set of parameters [3].
A head tracking system tracks and calculates head movements and updates the display accordingly. Head trackers provide up to six-degree-of-freedom measurements of position (X, Y, and Z coordinates) and orientation (azimuth, elevation, and roll). The head tracking system measures head movements over a range of 180° in azimuth, 130° in elevation, and 120° in roll, with an accuracy of about 1-2 mR on boresight and 2-6 mR at 10° eccentricity, and linear displacements of the order of 450 mm vertically, 400 mm horizontally, and 540 mm in the fore/aft direction [1].
The earliest head tracking displays used infrared light reflected from the viewer's head. In a Fresnel-lens single-viewer display [4], the head detector moves with the lens. Illumination of each side of the viewer's head by two wavelength bands of infrared light has also been applied [5]. A further display uses a large-format convex lens to illuminate the left side of the viewer's head with IR in the 830-870 nm range and the right side of the head in the 930-970 nm band. The outputs of a pair of cameras with filters are used to control the illumination from a monochrome 2-D display without the use of any additional processing. This method is used in other displays by the same researchers [6-7]. Real-time position and orientation tracking of the viewer is important for 3-D displays [8-9]. For some displays or applications, only orientation or only position can be tracked. This imposes many limitations but also simplifies the task significantly. There already exist small, cheap, and accurate inertial sensors which can be attached to HMDs. The accuracies of static position and orientation measurements and of dynamic movement measurements are important. Features such as sample rate, number of targets tracked, range of tracking, latency, update rate, registration, and space requirements may also be important. It is highly important that the viewer is tracked and the scene gets updated very fast. Many head tracking methods are available, including electromechanical, electromagnetic, acoustic, inertial, and optical tracking. The characteristics of head trackers, such as resolution, accuracy, and system responsiveness [10], are given below:
Resolution: Measures the exactness with which a system can locate a reported position. It is measured in terms of inches per inch of transmitter-receiver separation for position, and in degrees for orientation.
Accuracy: The range within which a reported position is correct. This is a function of the error involved in making measurements, and it is often expressed in statistical error terminology as degrees root mean square (RMS) for orientation and inches RMS for position.
System responsiveness comprises: Sampling Rate: The rate at which sensors are checked for data, usually expressed as a frequency.
Update Rate: The rate at which the system reports new position coordinates to the host computer, also usually given as a frequency.
Data Rate: The number of computed positions per second, usually expressed as a frequency.
Latency: Also known as lag; the delay between a movement of the remotely sensed object and the report of the new position, measured in milliseconds (ms). These characteristics provide some guidance for tracker performance. One of the most important is latency. Delays greater than 60 msec between head position and visual feedback impair adaptation and the illusion of presence [11]. Latencies greater than 10 msec may contribute to simulator sickness. Bryson [12] considers systems with a latency longer than 0.5 seconds not to be real-time interactive.
A head tracking system consists of four major assemblies: a face detector, a tracking mechanism, a smoother, and head position calculation. The face detector detects the face in all positions, even if the head is tilted or turned slightly away from the camera, using a fast and precise detection algorithm. An object tracking algorithm, called CAMShift, is used for tracking the object and reporting its position. The smoother calculates the current position as a weighted average of the previous positions rather than from the current measurement alone. For the head position calculation we need to know the field of view of the camera. Head tracking is thus a special case of tracking: since the head has only one position, only one point is tracked, which simplifies the task because it is enough to focus on only one point [13]. A typical head tracking system process is given in Figure 1.
Figure 1: Head Tracking System Process
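The smoother and the field-of-view-based head position calculation described above can be illustrated with a short sketch. This is only a minimal illustration under assumed values, not the authors' implementation: the weighting factor alpha and the pinhole-camera angle formula are assumptions chosen for clarity.

import math

def smooth(prev_estimate, new_measurement, alpha=0.8):
    # Weighted-average smoother: weight the previous estimate more heavily
    # than the raw new measurement to suppress jitter (alpha is assumed).
    return alpha * prev_estimate + (1.0 - alpha) * new_measurement

def pixel_to_angle(pixel_x, image_width, horizontal_fov_deg):
    # Convert a horizontal pixel offset from the image centre into a head
    # angle, assuming a simple pinhole camera with a known field of view.
    offset = pixel_x - image_width / 2.0
    focal_px = (image_width / 2.0) / math.tan(math.radians(horizontal_fov_deg / 2.0))
    return math.degrees(math.atan2(offset, focal_px))

# Example: a face detected 120 px right of centre in a 640 px wide frame
# from a camera with a 60 degree horizontal FOV (all values are invented).
raw_angle = pixel_to_angle(320 + 120, 640, 60.0)
smoothed = smooth(prev_estimate=9.5, new_measurement=raw_angle)
print(round(raw_angle, 1), round(smoothed, 1))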
OPTICAL HEAD TRACKING
An optical tracker employs an infrared emitter on the helmet to measure the pilot's head position. An optical head tracking system consists of three subsystems: the optical imaging system, the mechanical tracking platform, and the tracking computer.
Optical tracking imaging: It converts the light source into a digital image. Depending on the design, it can vary from a simple standard digital camera to an astronomical telescope on top of a mountain.
Mechanical tracking: It holds and manipulates the optical imaging system in such a way that it always points at the target being tracked.
Tracking computer: It captures the images from the optical imaging system, analyses them to extract the target position, and controls the mechanical tracking platform to follow the target. First, the tracking computer has to be able to capture images at a relatively high frame rate, which places a requirement on the bandwidth of the image capturing hardware. The second challenge is that the image processing software has to be able to extract the target image from its background and calculate its position.
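As a rough illustration of that second challenge, separating a bright LED target from the background and computing its position, the following sketch (an assumed illustration using NumPy, not the software of any tracker described here) thresholds a frame and takes the centroid of the bright pixels.

import numpy as np

def led_centroid(image, threshold=200):
    # Return the (x, y) centroid of pixels brighter than `threshold`,
    # or None if nothing exceeds it. The threshold value is assumed.
    ys, xs = np.nonzero(image > threshold)
    if xs.size == 0:
        return None
    return float(xs.mean()), float(ys.mean())

# Synthetic 8-bit frame with a bright 3x3 blob standing in for the helmet LED.
frame = np.zeros((480, 640), dtype=np.uint8)
frame[100:103, 200:203] = 255
print(led_centroid(frame))  # -> (201.0, 101.0)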
Disadvantages of optical systems are sensitivity to sunlight and other heat sources. The MiG-29/AA-11 Archer system uses optical tracking [20]. Optical head trackers require a direct line of sight and a large field of view (FOV) [1]. Optical trackers have less temporal lag than magnetic trackers, which may be affected by metal objects and electromagnetic radiation [14]. An optical head tracking system is more compact and lighter than a magnetic one. Optical trackers operate by remote measurement, with a camera, of the position in space of LEDs mounted on the helmet [16].
CHARACTERISTICS OF DIFFERENT TYPES OF OPTICAL HEAD TRACKERS USED TO DATE
Selcom AB, SELSPOT II: SELSPOT II is a commercial tracking system marketed by Selcom AB, a Swedish company. A camera registers light pulses from LEDs attached to the object being tracked. Located between the lens and electronics of the camera is the SELSPOT sensor, a patented photodetector made by SiTek Laboratories and consisting of a flat semiconductor disc. Each side of the diode has a light-sensitive coating to produce a high-resolution, two-axis field. When a light pulse from one of the LEDs passes the lens system in the camera and strikes a point within this field, the electronics registers the x and y coordinates in the two-axis field. Two or more cameras are required to analyze movements in three dimensions. The sampling rate is very high, 10 kHz.
Optotrak 3020: The OPTOTRAK 3020 by Northern Digital Inc. is an infrared (IR)-based, noncontact position and motion measurement system. Small IR LEDs (markers) attached to a subject are tracked by a number of custom-designed sensors. The 3-D positions of the markers are determined in real time or post hoc; up to 256 markers can be tracked. The position sensor consists of three 1-D charge-coupled device (CCD) sensors paired with three lens cells and mounted in a 1.1 m long stabilized bar. Within each of the three lens cells, light from the LED is directed onto a CCD and measured. All three measurements together determine the 3-D location of the marker, which is calculated and displayed in real time. Maximum data rate: 3500 Hz (raw), 600 Hz (real-time 3D).
MacReflex Motion Measurement System: The MacReflex Motion Measurement System, by Qualisys, Inc., is also designed to measure the 3-D motion of subjects in real time. The system comprises: 1) one or more MacReflex position sensors (a 3-D system uses from two to seven position sensors); 2) software to enable the user to set up and calibrate the field of view of the position sensors, and to process the measured spatial coordinates of the target markers that are attached to the subject being tracked; 3) passive reflective target markers; 4) a calibration frame for 3-D measurements; and 5) a Macintosh computer system. The position sensor has two components: a CCD digital video camera and a video processor. The camera views up to 20 markers in real time. It then sends the video image to the video processor, which determines the centroid of each marker and its x, y coordinates. A program converts the x, y coordinates to enable calculation of position, displacement, velocity, acceleration, angles, angular velocity, and angular acceleration. Sampling rate: 50-200 Hz.
DynaSight: The Origin Instruments Corporation tracking product, DynaSight, is an electro-optical sensor with integrated signal processing that performs 3-D measurements of a passive, non-tethered target. A two-color LED on the front of the sensor indicates the tracking status to the user. In a typical application, the sensor is mounted just above the viewable area of a real-time graphics display. The sensor's field of view is a nominal 75° cone, and the sensor is pointed such that this field covers the comfortable range of head/eye positions for the user of the display. The sensor measures and reports on the 3-D movements of a tiny target that is referenced to the user's forehead. The passive target itself can be mounted on eye glasses, stereoscopic goggles, or on the user's forehead. Larger high-performance targets are available that allow measurements at a sensor-to-target range of up to 20 feet. The Active Target Adapter enables tracking of up to four active targets tethered to the Adapter. Five DOF are achieved with two targets, while 6 DOF can be achieved by tracking three or four active targets. DynaSight is the first in a new line of 3-D measurement products. It is planned that future systems will offer 6 DOF for HMDs using passive sensors and multiple sensors for networked operations in large virtual volumes. Update rate: 64 Hz; latency: 16-31 msec.
RK-447 Multiple Target Tracking System: The RK-447 Multiple Target Tracking System, by ISCAN, Inc., is a video tracking system which can track up to 64 facial points at 60 Hz with a latency of 16 msec. It is a real-time digital image processor employing ISCAN's proprietary Simultaneous Multiple Area Recognition and Tracking (SMART) architecture. The ISCAN SMART processor computes the position and size of up to 256 areas that are within a particular range of intensity levels. Filtering the output of the SMART processor allows the complete system to specify targets of desired size, position, and intensity parameters from a field containing many potential targets. After positioning the imaging sensor to include the desired field of view, the image gray level corresponding to the target may be selected. The areas of the video image whose intensity is within the gray-level threshold setting are presented on the monitor as a bright overlay, letting the operator see precisely the video information being processed. For each threshold area, size and position data are computed and stored in a data table which may be accessed by an external computer. The RK-447 Multiple Target Tracking System divides the image signal into a 512 horizontal by 256 vertical picture element matrix. As the target's position and size data are automatically determined over the monitor image area, the data within the azimuth and elevation coordinate table correspond to the horizontal and vertical coordinates within the video matrix. These coordinate data are updated every 16 msec and are available for input to a computer. Parametric information may be input to the RK-447 to automatically limit the data set to targets within a particular size or position range.
MAGNETIC HEAD TRACKING
Magnetic head trackers use a set of coils in the cockpit that produce a magnetic field. A magnetic sensor (receiver) mounted on the helmet determines the strength and angle of the fields [1]. The receiver senses the changes in the magnetic field caused by movement. These changes are recorded and processed by an algorithm that determines the position and orientation of the receiver in relation to the transmitter. This position and orientation data is then sent to the computer to update the virtual environment display [2]. Most military tracking systems are based on magnetic tracking and have
dedicated processors. A small antenna behind the pilot's head creates a multi-component electromagnetic field of well-defined shape and strength. A sensor on the pilot's helmet measures the strength and direction of the different components of the field. The dedicated processor crunches these measurements to yield the position and orientation of the pilot's head. Results are accurate and very nearly immediate.
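The dedicated algorithms that turn these field measurements into a head pose are proprietary and not described here; the sketch below only illustrates the general idea, under two deliberately simplified assumptions: range is taken from a dipole-like falloff of the field magnitude (|B| proportional to 1/r**3), and azimuth from the two transverse field components.

import math

def estimate_range(b_magnitude, k=1.0):
    # Rough range estimate assuming a dipole-like falloff |B| = k / r**3;
    # k would come from transmitter calibration (assumed here).
    return (k / b_magnitude) ** (1.0 / 3.0)

def estimate_azimuth(bx, by):
    # Azimuth of the sensor in the transmitter frame, taken from the two
    # transverse field components (a simplification for illustration only).
    return math.degrees(math.atan2(by, bx))

# Invented field sample in arbitrary units.
bx, by, bz = 0.020, 0.012, 0.035
b = math.sqrt(bx**2 + by**2 + bz**2)
print(round(estimate_range(b), 2), round(estimate_azimuth(bx, by), 1))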
Magnetic sensors used in magnetic head tracking differ from most other detectors in that they do not directly measure the physical property of interest. Devices that monitor properties such as temperature, pressure, strain, or flow provide an output that directly reports the desired parameter (see Figure 2). Magnetic sensors, on the other hand, detect changes, or disturbances, in magnetic fields that have been created or modified, and from them derive information on properties such as direction, presence, rotation, angle, or electrical currents. The output signal of these sensors requires some signal processing for translation into the desired parameter. Although magnetic detectors are somewhat more difficult to use, they do provide accurate and reliable data without physical contact with the medium being measured [19].
Fig. 2 Conventional vs. magnetic sensing

CHARACTERISTICS OF DIFFERENT TYPES OF MAGNETIC HEAD TRACKERS USED TO DATE

Fastrak: The Fastrak was developed by Polhemus. Fastrak accepts data from up to 4 receivers, and up to 8 systems can be multiplexed for a total of 32 receivers. The update rate is 120 Hz divided by the total number of receivers. Disadvantages of Fastrak are its very high cost and limited working range. Latency is 4 msec.

Insidetrak: It is a smaller version of the Fastrak sensor. Testing found that Insidetrak sensing data are much noisier than Fastrak data [19]. The update rate is 60 Hz divided by the total number of receivers. Latency is 12 msec, with a working range of up to 5 feet.

Isotrak II: It is a lower-cost Polhemus product with slightly reduced performance compared to Fastrak. It consists of an electronics unit, a single transmitter, and 1 or 2 receivers. The Isotrak II update rate is 60 Hz divided by the total number of receivers. Latency is 20 msec.

Ultratrak: It is the most expensive Polhemus head tracker. Ultratrak consists of a 486-based Motion Capture Server unit which contains 4 to 8 motion capture boards (each board can support 2 receivers), a VGA controller, an external synchronization board, and a communications card. Ultratrak comes in a 60 Hz version and a 120 Hz version (Ultratrak 120). Both come with the Long Ranger transmitter (optional equipment for Fastrak and Insidetrak) that allows tracking and capturing a subject in an area in excess of 700 square feet. The update rate is 60 Hz for up to 8 receivers and 30 Hz for up to 16 receivers. Latency is 20 msec.

Flock of Birds: Flock of Birds is a 6 DOF tracking system by Ascension Technology Corporation. It is intended for tracking human motions in character animation, biomedics, and VE applications. In particular, Flock trackers are used for head tracking in flight simulators/trainers; head, hand, and body tracking in VE games; and full body tracking for character animation, performance animation, virtual walkthroughs, and sports analysis. Flock of Birds has full 360° coverage without blocking or echoing problems and a fast measurement rate of up to 144 position and orientation measurements per second. The update rate is up to 144 Hz.

PC/BIRD: PC/BIRD is a new offering from Ascension Technology Corporation that uses the same patented pulsed-DC magnetic technology employed in the other Ascension tracking products. Intended for use with PCs, this tracker is configured as an ISA-compatible board, a receiver that can be mounted on any nonmetallic object, and either a standard or extended range transmitter. With the standard range transmitter, PC/BIRD operates with a range of 4 feet; the extended range transmitter allows a range of up to 10 feet. Measurements are made at a rate of up to 144 per second; the update rate is the same as for Flock of Birds.

RESULTS

The comparative performance is given in Table 1.
TABLE 1: COMPARATIVE PERFORMANCE OF OPTICAL AND MAGNETIC HEAD TRACKING

Optical Head Tracking:
- It employs an infrared emitter on the helmet to measure the head position.
- Sensitive to sunlight and other heat sources; requires a direct line of sight and a large field of view.
- Less temporal lag than an electromagnetic tracker [14].
- Advantages: high-resolution image of the target being tracked; high availability; can work over a large area; high accuracy; no magnetic interference problem.
- Disadvantages: high cost; visible wavelengths are less optimal [15].

Magnetic Head Tracking:
- It employs coils to create a magnetic field; a location sensor mounted on the helmet is tracked through the magnetic field [1].
- Requires precise magnetic mapping of the cockpit to account for ferrous and conductive material in order to reduce angular error in the measurement [16].
- Affected by metal objects and electromagnetic radiation.
- Advantages: low cost; no drift; no lighting, background, or line-of-sight constraints; both wireless and wired models; real-time operation [17].
- Disadvantages: high latencies due to filtering; electromagnetic interference from radio; accuracy diminishes with distance; ferromagnetic/conductive metal surfaces cause field distortion [18].
CONCLUSION
Head tracking systems are mainly used in helmet-mounted displays for combat aircraft. The optical and magnetic head trackers have relative advantages and disadvantages. Optical head trackers need a direct line of sight and a larger FOV compared to magnetic trackers, and hence magnetic head trackers are better suited for military applications. Magnetic tracking technology is relatively mature, has been militarized, and offers the best overall head tracking performance available at this time. It is likely to be the predominant head tracking technique for the next generation of military head-coupled systems. This paper has given the performance characteristics of different types of trackers. The optical trackers described in this paper are SELSPOT II, Optotrak 3020, MacReflex Motion, DynaSight, and the RK-447 multiple target tracking system; the magnetic trackers are PC/BIRD, Space Pad, Flock of Birds, Ultratrak, Insidetrak, Isotrak II, and Fastrak. Different types of trackers have their own advantages and disadvantages. Finally, we give a comparison of optical and magnetic head tracking.

REFERENCES
[1] Velger, M. (1998). Helmet-mounted displays and sights. Boston, MA: Artech House.
[2] Rash, C. E. (Ed.) (2000). Helmet-mounted displays: Design issues for the rotary-wing environment. Bellingham, WA: SPIE Press.
[3] http://www.dtic.mil/cgibin/GetTRDoc?AD=ADA466568
[4] A. Schwartz, "Head tracking stereoscopic display," in Proc. IEEE Int. Displ. Res. Conf., 1985, pp. 141-144.
[5] Y. Nishida, T. Hattori, S. Omori, J. Suzuki, K. Katayama, and S. Sakuma, "Simplification of infrared illumination of stereoscopic liquid crystal TV," in Proc. SPIE, Stereoscop. Displ. Virt. Real. Syst. II, 1995, vol. 2409, pp. 96-100.
[6] T. Hattori, "Stereoscopic display employing head-position tracking using large format lenses," in Proc. SPIE, Stereoscop. Displ. Appl. IV, 1993, vol. 1915, pp. 2-5.
[7] Y. Nishida, T. Hattori, S. Sakuma, K. Katayama, S. Omori, and T. Fukuyo, "Autostereoscopic liquid crystal display II (practical application)," in Proc. SPIE, Stereoscop. Displ. Virt. Real. Syst., 1994, vol. 2177, pp. 150-155.
[8] W. Barfield and T. Caudell, Eds., Fundamentals of Wearable Computers and Augmented Reality. Mahwah, NJ: Lawrence Erlbaum, 2001.
[9] J. P. Rolland, L. D. Davis, and Y. Baillot, "A survey of tracking technologies for virtual environments," in Fundamentals of Wearable Computers and Augmented Reality, W. Barfield and T. Caudell, Eds. Mahwah, NJ: Lawrence Erlbaum, 2001, pp. 67-112.
[10] Meyer, K., Applewhite, H. L., & Biocca, F. A. (1992). "A survey of position trackers," Presence, 1(2), 173-200, MIT Press.
[11] Durlach, N. I., and A. S. Mavor (1994). "Virtual Reality: Scientific and Technical Challenges," National Academy Press, Washington, D.C.
[12] Bryson, S. (1992). "Measurement and calibration of static distortion of position data from 3D trackers," NASA RNR Technical Report RNR-92-011.
[13] Takemura, K., Ido, J., Matsumoto, Y., and Ogasawara, T., "Drive Monitoring System Based on Non-Contact Measurement System of Driver's Focus of Visual Attention," Proc. of IEEE Intelligent Vehicles Symposium, pp. 581-586, 2003.
[14] Ferrin, F. J. (1991). Survey of helmet tracking technologies. In Proceedings of SPIE Vol. 1456: Large Screen-Projection, Avionic, and Helmet-Mounted Displays, pp. 86-94.
[15] Veis, G. (1963). "Optical tracking of artificial satellites," Space Science Reviews 2: 250-296. Bibcode 1963SSRv....2...250V. doi:10.1007/BF00216781. http://en.wikipedia.org/wiki/Optical_motion_tracking
[16] Air Power Australia, "Helmet Mounted Sights and Displays," Ausairpower.net. http://www.ausairpower.net/hmdtechnology.html. Retrieved 2010-08-20.
[17] http://www.cadengineering.co.in/home6/products/3d-motion-trackers--capture-systems/electromagnetic-3d-motion-trackers
[18] http://www.hitl.washington.edu/scivw/scivwftp/publications/IDA-pdf/TRACK.PDF
[19] J. E. Lenz, "A Review of Magnetic Sensors," Proceedings of the IEEE, vol. 78, no. 6, June 1990, pp. 973-989.
[20] Garrett, "Reduction of Latency in VE Applications."

View File

@@ -0,0 +1,13 @@
Author: lenovo 1
Creator: Microsoft® Office Word 2007
Producer: Microsoft® Office Word 2007
CreationDate: 04/15/13 22:19:05
ModDate: 04/15/13 22:19:05
Tagged: yes
Form: none
Pages: 7
Encrypted: no
Page size: 594.96 x 840.96 pts (rotated 0 degrees)
File size: 334248 bytes
Optimized: no
PDF version: 1.5

View File

@@ -0,0 +1,268 @@
PHYSICAL REVIEW PHYSICS EDUCATION RESEARCH 15, 013101 (2019)
Role of diagrams in problem solving: An evaluation of eye-tracking parameters as a measure of visual attention
Ana Susac,1,2,* Andreja Bubic,3 Maja Planinic,2 Marko Movre,2 and Marijan Palmovic4
1Department of Applied Physics, Faculty of Electrical Engineering and Computing, University of Zagreb, Unska 3, 10000 Zagreb, Croatia
2Department of Physics, Faculty of Science, University of Zagreb, Bijenicka 32, 10000 Zagreb, Croatia 3Chair for Psychology, Faculty of Humanities and Social Sciences, University of Split, Sinjska 2, 21000 Split, Croatia
4Laboratory for Psycholinguistic Research, Department of Speech and Language Pathology, University of Zagreb, Borongajska cesta 83h, 10000 Zagreb, Croatia
(Received 15 March 2018; published 3 January 2019)
Typical physics textbook problems often include supportive diagrams that visualize the physical situation, although the potential benefits of providing such diagrams are not yet fully established. We used eye tracking to explore the role of supportive diagrams in problem solving. Including a supportive diagram with the text of the problem improved students' percentage of correct answers in one of the six problems used in the study. Eye-tracking data showed that students typically spent less time on the text of the problem if they were presented with a diagram, but the total viewing time did not change. When a diagram was presented, students split their attention between the diagram and the text without speeding up problem solving. Cognitive load theory and dual coding theory suggest that giving information in two formats (verbal and visual) might reduce extraneous cognitive load and leave more cognitive resources available for further steps in problem solving. However, this does not necessarily lead to a higher percentage of correct answers to the problem, because supportive diagrams influence only one step of the complex process of problem solving. In addition to the role of diagrams, we evaluated different eye-tracking measures as measures of visual attention during physics problem solving. It seems that the fixation duration is rather constant, and not always sensitive to the manipulation in the task. On the other hand, dwell time and the number of fixations show more variability across problems and participants, so they seem to be appropriate measures of visual attention. Since dwell time and fixation number are dependent measures, and they show a similar pattern of responses, in most cases it seems sufficient to report only one of them.
DOI: 10.1103/PhysRevPhysEducRes.15.013101
Introduction.—Teaching and learning physics problem solving is an important physics education research (PER) area. Various problem-solving strategies have been proposed, and all of them include some form of drawing diagrams. For example, the PER group at the University of Minnesota suggests sketching a picture in the first step of understanding and visualizing the problem [1]. In the next step of describing the physics of the problem, students are asked to draw a diagram or a graph that helps in understanding the problem (e.g., a free-body diagram). Nevertheless, reports on the role of diagrams in problem
*Corresponding author. ana.susac@fer.hr
Published by the American Physical Society under the terms of the Creative Commons Attribution 4.0 International license. Further distribution of this work must maintain attribution to the author(s) and the published article's title, journal citation, and DOI.
solving are disparate. It has been shown that students who draw diagrams are more successful in problem solving [2]. A reasonable inference would be to prompt students to draw diagrams or to include diagrams in problems as scaffolding. Some reports suggested that students who were given a diagram describing a physical situation were less likely to draw additional, expertlike diagrams (e.g., to draw vectors) and were less successful in problem solving than those who were prompted to draw the diagrams [3-5]. On the other hand, prompting novice students to draw force diagrams may result in lower success in problem solving [6]. More research is needed to determine the role of diagrams in problem solving. A recent large-scale study on the usefulness of supportive diagrams has shown a small positive effect on students' scores [7].
In this study, we use eye tracking to explore the role of supportive diagrams in problem solving. Measurement of eye movements provides more detailed information on the underlying cognitive processes during problem solving than conventional assessment methods, which provide only
the participant's score. Eye tracking also gives insight into the role of visual attention during problem solving in mathematics and science [8,9]. However, there is a limited number of PER studies that have used eye tracking [10-22]. In addition to information on spatial and temporal characteristics of visual attention, eye-tracking data may provide information on cognitive load. The cognitive load theory was developed to facilitate learning and problem solving [23]. Cognitive load theory specifies three types of cognitive load: intrinsic, extraneous, and germane [24,25]. Intrinsic cognitive load is related to the specific material or skill being learned, whereas extraneous cognitive load is associated with the way the material or skill is presented to a learner. Germane cognitive load is produced by the construction of schemas, i.e., permanent stores of knowledge, and is desirable because it supports learning of new materials or skills. According to the dual-coding theory, both visual and verbal representations are used to represent information [26].
The cognitive load theory [23] and the dual-coding theory [26] suggest that the supportive diagrams could be useful in the initial phase of problem solving because presenting the physical situation in two formats (verbal and visual) might reduce the extraneous cognitive load and leave more cognitive resources for further steps in problem solving. Previous non-PER studies showed that various eye-tracking measures can be used as correlates of work load during different tasks [27,28]. However, tasks in PER studies are complex and different from the typical visual search or memory tasks, and the advantages and disadvantages of different eye-tracking measures in PER tasks should be explored. A recent PER study used eye tracking to investigate the effect of intrinsic, extraneous, and germane cognitive load on different eye-tracking measures during multimedia lessons [29]. The results provided insight into the complex interrelations among different eye-tracking measures. In this study, our goal is to further evaluate application of eye-tracking measures during physics problem solving.
We aim to answer the following research questions: (i) Do diagrams help students in problem solving? (ii) What can eye tracking reveal about the role of diagrams in problem solving? (iii) Which eye-tracking parameters are appropriate measures of visual attention during physics problem solving?
Methods.—Participants: Sixty undergraduate students from the Department of Physics, University of Zagreb participated in this study. All participants were senior-year prospective physics teachers. Each participant gave informed written consent before taking part in the experiment. Materials: Six multiple-choice questions on energy were used in this study (see the Supplemental Material [30,31]). The questions were always presented in the same order. In one experimental group, the first, third, and fifth questions were presented with diagrams, while the
remaining questions were presented without diagrams. In
another experimental group, the second, fourth, and sixth
questions were presented with diagrams.
Procedure: Eye-movement data were recorded as in our
previous study [14]. Data analysis: The recorded eye-movement data were analyzed using BeGaze software. Saccade duration and number of regressions are often-used eye-tracking measures, but they might be difficult to interpret for our task because they depend on the spatial layout of the problem and the related diagram. Thus, we evaluate and compare the following eye-tracking measures: dwell time (viewing time), number of fixations, and average fixation duration. We defined three areas of interest (AOIs) for each question and calculated the chosen eye-tracking measures for each AOI. AOIs were rectangles that included the text of the problem (question), the multiple-choice answers (multiple choice), and the related diagram (diagram). In the "no diagram" condition, the AOI diagram was an empty rectangle at the same place as the diagram in the "diagram" condition.
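To make the dwell-time and fixation-number measures concrete, the sketch below assigns fixations to rectangular AOIs and accumulates both measures per AOI. It is only an assumed illustration of the kind of aggregation that analysis software performs; the AOI coordinates and fixation data are invented.

# Each AOI is a rectangle (x_min, y_min, x_max, y_max) in screen pixels;
# each fixation is (x, y, duration_ms). All values below are invented.
aois = {
    "question":        (50,  50, 900, 250),
    "multiple choice": (50, 300, 900, 500),
    "diagram":         (950, 50, 1500, 500),
}
fixations = [(120, 100, 240), (400, 180, 260), (1100, 200, 300), (300, 400, 220)]

dwell_ms = {name: 0 for name in aois}
fix_count = {name: 0 for name in aois}

for fx, fy, dur in fixations:
    for name, (x0, y0, x1, y1) in aois.items():
        if x0 <= fx <= x1 and y0 <= fy <= y1:
            dwell_ms[name] += dur   # dwell time = summed fixation durations in the AOI
            fix_count[name] += 1    # number of fixations in the AOI
            break

for name in aois:
    avg = dwell_ms[name] / fix_count[name] if fix_count[name] else 0.0
    print(name, dwell_ms[name], fix_count[name], round(avg))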
Students' responses were graded as either correct or incorrect. The χ² tests, analyses of variance (ANOVAs), and Bonferroni-corrected Student's t-tests were conducted with a significance threshold of p = 0.05.
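A contingency-table χ² test of this kind can be run as follows; the counts are invented, and scipy's chi2_contingency is only one possible way to perform such a test, not necessarily the software used by the authors.

from scipy.stats import chi2_contingency

# Invented 2x2 table: rows are the conditions (diagram / no diagram),
# columns are the answers (correct / incorrect), 30 students per row.
table = [[24, 6],
         [15, 15]]

chi2, p, dof, expected = chi2_contingency(table)
print(f"chi2({dof}) = {chi2:.2f}, p = {p:.3f}")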
Results.—Analysis of students' responses: Students' percentages of correct answers on all questions presented with and without diagrams are shown in Fig. 1. The χ² tests showed statistically significant differences in percentages of correct answers between the two conditions (diagram, no diagram) only for question Q3 [χ²(1) = 6.24, p = 0.01]. Overall, the difference between the percentages of correct answers on questions with a diagram and with no diagram was not statistically significant [χ²(1) = 1.63, p > 0.05].
Analysis of eye-tracking data for question Q3: As we
found a positive effect of the supportive diagram on students' percentages of correct answers for question Q3, we further explored eye-tracking data for that question.
We compared three eye-tracking measures (dwell time,
number of fixations, and average fixation duration) for AOIs question, multiple choice, and diagram (Fig. 2).
Dwell time was longer for the AOI question if a diagram was not presented [t(58) = 4.04, p = 0.0004], but there was no difference for the AOI multiple choice [t(58) = 0.43, p > 0.05]. Correspondingly, the fixation number was larger for the AOI question if a diagram was not presented [t(58) = 3.85, p = 0.0006] and there was no difference for the AOI multiple choice [t(58) = 0.17, p > 0.05]. Average fixation duration was not significantly different for both AOIs, question and multiple choice [t(58) = 2.13, p > 0.05; t(58) = 1.62, p > 0.05, respectively]. As expected, most participants did not look at the AOI diagram
if the diagram was not presented. Thus, we did not statistically compare eye-tracking measures for the AOI diagram.
When dwell times and fixation numbers were summed
across three AOIs, and fixation duration averaged across
three AOIs, they did not differ for the two conditions [t(58) = 1.27, p > 0.05; t(58) = 1.09, p > 0.05; t(58) = 0.77, p > 0.05, respectively].
FIG. 1. Comparison of students' percentages of correct answers on all questions (Q1-Q6) between the two conditions (diagram, no diagram). The error bars represent 1 SEM.
Corresponding analysis was conducted for correct and incorrect problem solvers (see the Supplemental Material [30]). The results showed that correct problem solvers had shorter dwell time and a smaller number of fixations compared to incorrect problem solvers, whereas no statistically significant difference was found for the average fixation duration.
Analysis of eye-tracking data for all six questions: Since the diagram decreased the dwell time and fixation number for question Q3, we evaluated its effect on the three eye-tracking measures in AOI question for all questions (Fig. 3). Three two-way mixed-design ANOVAs were conducted on dwell time, number of fixations, and average fixation duration with factors diagram and question. The results obtained showed a statistically significant main effect of both factors on all three eye-tracking measures, whereas the interaction effect was significant for dwell time and fixation number (Table I). Dwell time and fixation number decreased when a diagram was presented in
questions Q1 [t(58) = 3.88, p = 0.002; t(58) = 3.64, p = 0.004, respectively], Q3 [t(58) = 4.04, p = 0.001; t(58) = 3.85, p = 0.002, respectively], and Q6 [t(58) = 3.70, p = 0.003; t(58) = 3.56, p = 0.005, respectively]. The presence of a diagram did not statistically significantly change dwell time and fixation number in questions Q2 [t(58) = 2.05, p > 0.05; t(58) = 1.28, p > 0.05, respectively], Q4 [t(58) = 0.88, p > 0.05; t(58) = 1.23, p > 0.05, respectively], and Q5 [t(58) = 2.31, p > 0.05; t(58) = 2.54, p > 0.05, respectively].
FIG. 2. Dwell time, number of fixations, and average fixation duration for three AOIs (question, multiple choice, and diagram) and all three summed AOIs (shown in green) on question Q3. The error bars represent 1 SEM.
FIG. 3. Dwell time, number of fixations, and average fixation duration for the AOI question, calculated for the two conditions (diagram, no diagram) and all questions (Q1-Q6). The error bars represent 1 SEM.
FIG. 4. Dwell time, number of fixations, and average fixation duration for the AOI diagram, for all questions (Q1-Q6). The error bars represent 1 SEM.
Furthermore, eye tracking provided data on the time that
a participant spent attending to the diagram. Figure 4 shows
the three eye-tracking measures for all six questions. One-
way ANOVAs revealed that the dwell time, number of
fixations, and average fixation duration were significantly different across the questions [F(5,145) = 9.49, p < 0.0001, ηp² = 0.247; F(5,145) = 9.91, p < 0.0001, ηp² = 0.255; F(5,145) = 2.88, p = 0.016, ηp² = 0.090]. Figure 4 reveals that participants spent the shortest time attending to
the diagram in problem Q5. Regarding our second research question on the evaluation of eye-tracking measures, visual inspection of Figs. 2, 3, and 4 reveals similar patterns for dwell time and fixation number, whereas fixation duration shows smaller variation. Roughly, dwell time is proportional to fixation number multiplied by average fixation duration. Thus, for a rather constant average fixation duration, dwell time and fixation number have similar patterns. This conclusion was confirmed by inspecting individual participants' data (see the Supplemental Material [30]). There is more variability in the total dwell time and total fixation number compared to average fixation duration. To quantify the variability in different eye-tracking measures, we calculated the coefficient of variation (CV) as the ratio of the standard deviation (SD) to the mean (Table II). The CV of the total dwell time and total fixation number is about 3 times larger than the CV of average fixation duration.
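The two relations used here, CV = SD / mean and dwell time roughly equal to fixation number times average fixation duration, can be checked directly against the Table II values; the short snippet below uses the Q1 column as an example.

# Q1 values from Table II: dwell time 33 s (SD 11), 120 fixations (SD 41),
# average fixation duration 240 ms (SD 33).
dwell_mean_s, dwell_sd_s = 33.0, 11.0
fix_mean, fix_sd = 120.0, 41.0
dur_mean_ms, dur_sd_ms = 240.0, 33.0

def cv(sd, mean):
    # Coefficient of variation: standard deviation divided by the mean.
    return sd / mean

print(round(cv(dwell_sd_s, dwell_mean_s), 2))   # 0.33, matches Table II
print(round(cv(fix_sd, fix_mean), 2))           # 0.34
print(round(cv(dur_sd_ms, dur_mean_ms), 2))     # 0.14
# Dwell time is roughly fixation number x average fixation duration:
print(round(fix_mean * dur_mean_ms / 1000.0, 1), "s vs reported", dwell_mean_s, "s")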
Discussion.—Supportive diagrams had a positive effect on students' percentages of correct answers in one of the six problems used in the study. This result is in agreement with the previous studies that reported no effect
TABLE I. Results of two-way ANOVAs conducted on dwell time, fixation number, and fixation duration for the AOI question with between-subjects factor diagram (diagram vs no diagram) and within-subjects factor question (Q1-Q6).

Factor       Statistic   Dwell time      Fixation number   Fixation duration
Diagram      F (df)      20.14 (1, 58)   14.53 (1, 58)     6.89 (1, 58)
             p           < 10^-4         < 10^-4           0.01
             ηp²         0.258           0.200             0.106
Question     F (df)      9.25 (5, 290)   7.03 (5, 290)     9.94 (5, 290)
             p           < 10^-4         < 10^-4           < 10^-4
             ηp²         0.137           0.108             0.146
Interaction  F (df)      3.28 (5, 290)   4.23 (5, 290)     0.45 (5, 290)
             p           0.007           0.001             > 0.05
             ηp²         0.530           0.068             0.008
TABLE II. Mean, standard deviation (SD), and coefficient of variation (CV) of total dwell time, total fixation number, and average fixation duration for each problem.

Question                              Q1         Q2         Q3         Q4         Q5         Q6
Dwell time (s), mean ± SD             33 ± 11    40 ± 16    35 ± 18    45 ± 26    43 ± 29    40 ± 19
Dwell time, CV                        0.33       0.40       0.51       0.58       0.67       0.48
Fixation number, mean ± SD            120 ± 41   142 ± 54   130 ± 60   158 ± 96   144 ± 85   140 ± 62
Fixation number, CV                   0.34       0.38       0.46       0.61       0.59       0.44
Fixation duration (ms), mean ± SD     240 ± 33   250 ± 38   237 ± 36   255 ± 43   260 ± 47   250 ± 45
Fixation duration, CV                 0.14       0.15       0.15       0.17       0.18       0.18
or a small positive effect of supportive diagrams on problem solving [7]. The reason why the diagram in question Q3 had a positive effect on students' percentages of correct answers could be that it emphasized the initial equal elevations of the bricks (see the Supplemental Material [30]). Although this diagram does not provide any additional information that is not present in the text, it could have influenced student reasoning similarly to the visual cues in Madsen et al. [16]. One interpretation may be that a diagram might be beneficial if it highlights some information relevant for problem solving. Overall, our results suggest that, while diagrams can help students in problem solving, the effect is small and rarely statistically significant.
Analysis of eye-tracking data showed that students who were presented with a supportive diagram usually spent less time reading the text of the problem, but the total viewing time was the same for the two conditions (diagram, no diagram). When a diagram was presented students split their attention between the diagram and the text without speeding up problem solving, which was in agreement with our previous findings [12]. Cognitive load theory combined with the dual coding theory implies that information in both formats (verbal and visual) reduces extraneous cognitive load and helps students to better understand the physical situation described in the text. However, better understanding of the physical situation does not necessarily lead to more efficient problem solving. Even if diagrams were useful in the initial phase of problem visualization, they would not guarantee more efficient implementation of the subsequent phases in problem solving. Eye-tracking data do not seem to agree with the hypothesis that presenting information in both formats significantly improves the overall efficiency in problem solving. Further studies are
needed to assess the effect of presenting information in both formats on different steps in problem solving.
One goal of this study was to determine which eye-tracking parameters are appropriate measures of visual attention during physics problem solving. Our results show that fixation duration is rather constant across questions and participants, suggesting that it might not be sensitive enough to reveal all differences in visual attention caused by some task manipulation. Although fixation duration is used as a measure of visual attention, it is possible that it is not so suitable for physics problem solving. On the other hand, dwell time and fixation number show larger variance and sensitivity to task manipulation and therefore seem to be more appropriate eye-tracking measures for exploring physics problem solving. Similar results are reported in a previous study of website complexity [28] and a PER study on kinematic graphs [20]. Patterns of dwell time and fixation number show analogous effects of task manipulation because these two measures are dependent; therefore it might be adequate to report only one of these two measures. In this study, we compared the most commonly used eye-tracking parameters in PER studies. In future studies, other eye-tracking measures should also be evaluated.
We found that dwell time was sensitive to extraneous load, which corroborates the results from Zu et al. [29]. However, Zu et al. reported significant sensitivity of the mean fixation duration to extraneous load that was not found in our study. A possible explanation for this discrepancy could include differences in the design of the study and the corresponding data analysis. Further research with different study designs is needed to explore the relationship between eye-tracking measures and cognitive load.
[1] P. Heller, R. Keith, and S. Anderson, Teaching problem solving through cooperative grouping. Part 1: Group versus individual problem solving, Am. J. Phys. 60, 627 (1992).
[2] A. Mason and C. Singh, Helping students learn effective problem solving strategies by reflecting with peers, Am. J. Phys. 78, 748 (2010).
[3] A. Maries and C. Singh, Should students be provided diagrams or asked to draw them while solving introductory physics problems?, AIP Conf. Proc. 1413, 263 (2012).
[4] A. Maries and C. Singh, To use or not to use diagrams: The effect of drawing a diagram in solving introductory physics problems, AIP Conf. Proc. 1513, 282 (2013).
[5] A. Maries and C. Singh, A good diagram is valuable despite the choice of a mathematical approach to problem solving, AIP Conf. Proc. 1513, 31 (2013).
[6] A. F. Heckler, Some consequences of prompting novice physics students to construct force diagrams, Int. J. Sci. Educ. 32, 1829 (2010).
[7] Z. Chen, N. Demirci, Y. J. Choi, and D. E. Pritchard, To draw or not to draw? Examining the necessity of problem diagrams using massive open online course experiments, Phys. Rev. Phys. Educ. Res. 13, 010110 (2017).
[8] M. J. Tsai, H. T. Hou, M. L. Lai, W. Y. Liu, and F. Y. Yang, Visual attention for solving multiple-choice science problem: An eye-tracking analysis, Comput. Educ. 58, 375 (2012).
[9] A. Susac, A. Bubic, J. Kaponja, M. Planinic, and M. Palmovic, Eye movements reveal students strategies in simple equation solving, Int. J. Sci. Math. Educ. 12, 555 (2014).
[10] T. van Gog, F. Paas, and J. J. G. Van Merriënboer, Uncovering expertise-related differences in troubleshooting performance: Combining eye movement and concurrent verbal protocol data, Appl. Cogn. Psychol. 19, 205 (2005).
[11] M. Kozhevnikov, M. A. Motes, and M. Hegarty, Spatial visualization in physics problem solving, Cogn. Sci. 31, 549 (2007).
[12] A. Susac, A. Bubic, P. Martinjak, M. Planinic, and M. Palmovic, Graphical representations of data improve student understanding of measurement and uncertainty: An eye-tracking study, Phys. Rev. Phys. Educ. Res. 13, 020125 (2017).
[13] P. Klein, J. Viiri, S. Mozaffari, A. Dengel, and J. Kuhn, Instruction-based clinical eye-tracking study on the visual interpretation of divergence: How do students look at vector field plots?, Phys. Rev. Phys. Educ. Res. 14, 010116 (2018).
[14] A. Susac, A. Bubic, E. Kazotti, M. Planinic, and M. Palmovic, Student understanding of graph slope and area under a graph: A comparison of physics and nonphysics students, Phys. Rev. Phys. Educ. Res. 14, 020109 (2018).
[15] A. D. Smith, J. P. Mestre, and B. H. Ross, Eye-gaze patterns as students study worked-out examples in mechanics, Phys. Rev. ST Phys. Educ. Res. 6, 020118 (2010).
[16] A. M. Madsen, A. M. Larson, L. C. Loschky, and N. S. Rebello, Differences in visual attention between those who correctly and incorrectly answer physics problems, Phys. Rev. ST Phys. Educ. Res. 8, 010122 (2012).
[17] A. Madsen, A. Rouinfar, A. M. Larson, L. C. Loschky, and N. S. Rebello, Can short duration visual cues influence students reasoning and eye movements in physics problems?, Phys. Rev. ST Phys. Educ. Res. 9, 020104 (2013).
[18] A. Rouinfar, E. Agra, A. M. Larson, N. S. Rebello, and L. C. Loschky, Linking attentional processes and conceptual problem solving: Visual cues facilitate the automaticity of extracting relevant information from diagrams, Front. Psychol. 5, 1094 (2014).
[19] M. Kekule, Students' approaches when dealing with kinematics graphs explored by eye-tracking research method, Proceedings of the Frontiers in Mathematics and Science Education Research Conference (Science Education Research Group at Eastern Mediterranean University, Famagusta, 2014), pp. 108-117, ISSN 2301-251X.
[20] M. Kekule, Students' different approaches to solving problems from kinematics in respect of good and poor performance, Proceedings of the International Conference on Contemporary Issues in Education, Dubai (2015), pp. 126-134.
[21] S. C. Chen, H. C. She, M. H. Chuang, J. Y. Wu, J. L. Tsai, and T. P. Jung, Eye movements predict students' computer-based assessment performance of physics concepts in different presentation modalities, Comput. Educ. 74, 61 (2014).
[22] J. Han, L. Chen, Z. Fu, J. Fritchman, and L. Bao, Eye-tracking of visual attention in web-based assessment using the Force Concept Inventory, Eur. J. Phys. 38, 045702 (2017).
[23] J. Sweller, Cognitive load during problem solving: Effects on learning, Cogn. Sci. 12, 257 (1988).
[24] J. Sweller, Cognitive load theory, learning difficulty, and instructional design, Learn. Instr. 4, 295 (1994).
[25] J. Sweller, Element interactivity and intrinsic, extraneous, and germane cognitive load, Educ. Psychol. Rev. 22, 123 (2010).
[26] J. M. Clark and A. Paivio, Dual coding theory and education, Educ. Psychol. Rev. 3, 149 (1991).
[27] K. F. Van Orden, W. Limbert, S. Makeig, and T.-P. Jung, Eye activity correlates of workload during a visuospatial memory task, Human Factors 43, 111 (2001).
[28] Q. Wang, S. Yang, M. Liu, Z. Cao, and Q. Ma, An eye-tracking study of website complexity from cognitive load perspective, Decision Support Systems 62, 1 (2014).
[29] T. Zu, J. Hutson, L. C. Loschky, and N. S. Rebello, Use of eye-tracking technology to investigate cognitive load theory, Proceedings of the Physics Education Research Conference 2017, Cincinnati, OH (AIP, New York, 2017), p. 472.
[30] See Supplemental Material at http://link.aps.org/supplemental/10.1103/PhysRevPhysEducRes.15.013101 for the problems used in the study, the comparison of correct and incorrect problem solvers, and participants' individual dwell time, fixation number, and fixation duration.
[31] N. Brkovic, Zbirka zadataka iz fizike (LUK, Zagreb, 2001).

View File

@@ -0,0 +1,16 @@
Title: Role of diagrams in problem solving: An evaluation of eye-tracking parameters as a measure of visual attention
Subject: Phys. Rev. Phys. Educ. Res. 15, 013101 (2019). doi:10.1103/PhysRevPhysEducRes.15.013101
Keywords: doi:10.1103/PhysRevPhysEducRes.15.013101 url:https://doi.org/10.1103/PhysRevPhysEducRes.15.013101
Author: Ana Susac
Creator: Published by the American Physical Society
Producer: Acrobat Distiller 10.0.0 (Windows)
CreationDate: 01/03/19 00:00:00
ModDate: 01/03/19 00:00:00
Tagged: no
Form: none
Pages: 6
Encrypted: no
Page size: 612 x 792 pts (letter) (rotated 0 degrees)
File size: 571719 bytes
Optimized: yes
PDF version: 1.4

View File

@@ -0,0 +1 @@
{"pageIndex":0,"scale":"page-width","top":582,"left":-7,"scrollMode":0,"spreadMode":0}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,16 @@
Title: LNCS 7700 - Neural Networks: Tricks of the Trade
Subject: Neural Networks: Tricks of the Trade
Keywords:
Author: Grégoire Montavon, Geneviève B. Orr, and Klaus-Robert Müller (eds.)
Creator: gnuplot 4.2 patchlevel 2
Producer: Acrobat Distiller 10.0.0 (Windows)
CreationDate: 11/10/12 15:07:30
ModDate: 11/14/12 16:54:53
Tagged: no
Form: AcroForm
Pages: 753
Encrypted: no
Page size: 439.363 x 666.131 pts (rotated 0 degrees)
File size: 12243176 bytes
Optimized: yes
PDF version: 1.6

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
Title: Zahlenspiegel der Leibniz Universität Hannover 2024
Author: Leibniz Universität Hannover - Präsidialstab PS4.3 (M. Flechtner)
Creator: Adobe InDesign 19.5 (Windows)
Producer: Adobe PDF Library 17.0
CreationDate: 01/21/25 08:09:45
ModDate: 01/21/25 08:16:10
Tagged: no
Form: AcroForm
Pages: 82
Encrypted: AES 256-bit
Permissions: print:yes copy:no change:no addNotes:no
Page size: 595.279 x 841.89 pts (A4) (rotated 0 degrees)
File size: 3877417 bytes
Optimized: yes
PDF version: 1.7

View File

@@ -0,0 +1,474 @@
IEEE TRANSACTIONS ON SYSTEMS, MAN, AND CYBERNETICS, VOL. 22, NO. 3, MAY/JUNE 1992
Visual Perception and Sequences of Eye Movement Fixations: A Stochastic Modeling Approach
Selim S. Hacisalihzade, Senior Member, IEEE, Lawrence W. Stark, Fellow, IEEE, and John S. Allen
Abstract- Sequences of visual fixations, while looking at an object, are modeled as Markov processes and statistical properties of such processes are derived by means of simulations. The sequences are also abstracted as character strings and a quantitative method of measuring their similarity, based on minimum string editing cost (actually dissimilarity distance), is introduced. Interrelationships between the structure and size of the generating Markov matrices and the string editing distance shed light on the relative roles of deterministic and probabilistic processes in producing human visual scanpaths.
I. INTRODUCTION
EYE MOVEMENTS are necessary for vision while looking at an object that spans more than several degrees in the subject's field of view, because detailed visual information can only be obtained through the fovea, the small (about one degree) central area of the retina that has the highest photoreceptor concentration. Therefore, the brain directs the eye to move in such a way as to foveate successively onto the points of interest [15]. While viewing a stationary object, the eyes alternate between fixations and saccades, very rapid eye movements. Each saccade leads to a new fixation. Typically there are about three saccades per second, but since saccades are so fast, they occupy only about 10% of the total viewing time. Vision is suppressed during the saccades and thus almost all the visual information is collected during the fixations [10].
Clearly, eye movements play a very important role in visual perception. It was found about two decades ago that people have repetitive and idiosyncratic ways of inspecting and recognizing a particular familiar object; these patterns were named scanpaths [7].
Storing and retrieving memories are important components of visual learning and recognition. Therefore, the memory system of the brain must contain an internal representation of every object that is to be recognized. Thus, familiarizing oneself with an object may be considered as the process of constructing this representation. Similarly, recognition of an object may be viewed as the process of matching it with its stored internal representation. A non-Gestalt view suggests that the internal representation is made of components and that during recognition the features of the model are matched step
Manuscript received November 16, 1990, revised August 22, 1991. This work was supported in part by the Swiss Academy of Medical Sciences and in part by the Swiss National Science Foundation grant 5.521.330.615/7.
S. S. Hacisalihzade is with Landis & Gyr, Corporate Research and Development, CH-6301 Zug, Switzerland.
L. W. Stark is with the University of California, Berkeley, CA 94720.
J. S. Allen is with the Department of Anthropology, University of Auckland, New Zealand.
IEEE Log Number 9104550.
by step with the object. In support of a serial process, it is known that the eyes seem to visit the features of the object under study cyclically, following somewhat regular scanpaths rather than crisscrossing it at random. A serial model of internal representations of objects based on this evidence is the so called "scanpath feature ring" [8]. This model maintains that the representations of objects are composed of sensory memory traces recording the features and motor memory traces of the eye movements from one feature to the other. A modified and more realistic version of this model introduces randomness into the generation of scanpaths.
Markov matrices were used by Stark and Ellis [11] in earlier studies in an attempt to go beyond visual inspection of the eye movement traces and application of a subjective test for similarity of such traces. They used Markov matrices identified from experimental sequences and showed the existence of a few structured processes. They also looked at structures beyond the first order Markov matrix, i.e., do the states $n-1$, $n-2, \ldots$ previous to the present state $n$ affect the probability of transition to the next state $n+1$? Unfortunately, the size of the higher order Markov matrices increases geometrically with the order. Thus very large experimental sequences would have been necessary for the identification of these matrices. For a nonstationary generator, as the actively looking human, this poses obvious experimental difficulties. Can one further quantify the similarity of sequences of visual fixations?
String editing is also a possible way to study the similarity of sequences by looking at the similarity of the corresponding strings. A reasonable way of doing this is to define a distance between strings that characterize sequences of visual fixations and to set a threshold below which strings, thus sequences of visual fixations, are similar. The question about the similarity of strings is a problem that has been occupying computer scientists [13] as well as biologists studying RNA and DNA sequences [3].
The aim of this paper is to demonstrate the feasibility of quantifying the similarity of visual fixation sequences while looking at familiar objects.
II. A MARKOVIAN MODEL OF SEQUENCES OF VISUAL FIXATIONS
Let us now divide the image under study (Fig. 1) into several regions of interest and label them with letters like the hand (A), the mouth (B), the nose (C), the left eye (D), the right eye (E), the neck (F), and the ear (G). If we call these regions states into which the fixations must be located and postulate that the transitions from one state to another have
certain probabilities, we can effectively describe the generating process for these sequences of fixations as Markov processes [5]. In particular, the sequence $O_A$ of fixations in Fig. 1 is BBFFAAAABCEDCG. We can see that this sequence could have been generated by the Markov matrix $M_G$ (note the terminal fixation on the ear):
$$M_G = \begin{pmatrix}
0.75 & 0.25 & 0 & 0 & 0 & 0 & 0 \\
0 & 0.33 & 0.33 & 0 & 0 & 0.33 & 0 \\
0 & 0 & 0 & 0 & 0.5 & 0 & 0.5 \\
0 & 0 & 1 & 0 & 0 & 0 & 0 \\
0 & 0 & 0 & 1 & 0 & 0 & 0 \\
0.5 & 0 & 0 & 0 & 0 & 0.5 & 0 \\
0 & 0 & 0 & 0 & 0 & 0 & 1
\end{pmatrix}$$
The same matrix could generate many other sequences as other realizations of the generating process, for instance $O_B$ = BFFAAAABFABBCEDCEDCGGGG; note that the lengths of the sequences do not need to be equal; also note that in this case a terminal string of G's is produced. However, when we calculate the transition probabilities that occurred in this second sequence $O_B$ and summarize these probabilities in another Markov matrix $\hat{M}_B$ resulting from the second sequence, we get
$$\hat{M}_B = \begin{pmatrix}
0.6 & 0.4 & 0 & 0 & 0 & 0 & 0 \\
0 & 0.25 & 0.25 & 0 & 0 & 0.5 & 0 \\
0 & 0 & 0 & 0 & 0.67 & 0 & 0.33 \\
0 & 0 & 1 & 0 & 0 & 0 & 0 \\
0 & 0 & 0 & 1 & 0 & 0 & 0 \\
0.67 & 0 & 0 & 0 & 0 & 0.33 & 0 \\
0 & 0 & 0 & 0 & 0 & 0 & 1
\end{pmatrix}$$
where $\hat{M}_B$ is an identification of the generating process characterized by $M_G$; the sequential data in $O_B$ is summarized in $\hat{M}_B$.
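As an illustration of this identification step, here is a minimal Python sketch (not from the paper; the function name and the absorbing-state handling of the terminal fixation are our own assumptions) of how a first-order transition matrix can be estimated from a labeled fixation string:

from collections import defaultdict

STATES = "ABCDEFG"  # region labels used in the text: hand, mouth, nose, left eye, right eye, neck, ear

def transition_matrix(sequence, states=STATES):
    """Estimate a first-order Markov transition matrix from a fixation-label string."""
    counts = defaultdict(lambda: defaultdict(int))
    for a, b in zip(sequence, sequence[1:]):
        counts[a][b] += 1
    matrix = []
    for a in states:
        total = sum(counts[a].values())
        if total == 0:
            # No outgoing transition observed: treat the state as absorbing,
            # mirroring how M_G handles the terminal fixation on the ear.
            matrix.append([1.0 if b == a else 0.0 for b in states])
        else:
            matrix.append([counts[a][b] / total for b in states])
    return matrix

for row in transition_matrix("BBFFAAAABCEDCG"):   # the example sequence O_A
    print(["%.2f" % p for p in row])

Up to rounding, running this on O_A reproduces M_G above, and running it on O_B reproduces the estimated matrix.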
Fig. 1. Eye movements made by a subject viewing for the first time a drawing adapted from the Swiss artist Klee. Numbers show the order of the subject's visual fixations on the picture during free viewing. Lines between fixations represent rapid saccades from one fixation to the other.

We now define
$$E = M_G - \hat{M}_B$$
as the error or statistical discordance matrix between the idealized generating matrix $M_G$ and the estimated or observed matrix $\hat{M}_B$. Note: We have assumed $M_G$ as the generating matrix and thus take it as given rather than as an estimate, even though we in fact obtained this illustrative case backward. $E$ is clearly a function of the number of states and the lengths of the sequences as well as a function of the elements of the generating matrix, that is, the structure of the matrix. A possible scalar measure of the statistical discordance matrix $E$ is the typical error of each element, computed from the elements $e_{ij}$ of the error matrix $E$ and its dimension $n$.
Let us now look at the typical errors resulting from several processes with different matrix sizes, sequences of different lengths and structures. The first process in consideration is a process in which a transition occurs from one state to the next with 90% and to the one after the next with 10% probability; no other transitions are possible. For $n = 4$ the corresponding Markov matrix is
$$M_1 = \begin{pmatrix} 0 & 0.9 & 0.1 & 0 \\ 0 & 0 & 0.9 & 0.1 \\ 0.1 & 0 & 0 & 0.9 \\ 0.9 & 0.1 & 0 & 0 \end{pmatrix}$$
In the second process a transition occurs from one state to the next with 70% and to the one after the next with 30% probability; no other transitions are possible. For $n = 4$ the corresponding Markov matrix is
$$M_2 = \begin{pmatrix} 0 & 0.7 & 0.3 & 0 \\ 0 & 0 & 0.7 & 0.3 \\ 0.3 & 0 & 0 & 0.7 \\ 0.7 & 0.3 & 0 & 0 \end{pmatrix}$$
In the third process a transition occurs from one state to the next with 50% and to the one after the next with 10% probability; all other transitions are equiprobable. For $n = 4$ the corresponding Markov matrix is
$$M_3 = \begin{pmatrix} 0.2 & 0.5 & 0.1 & 0.2 \\ 0.2 & 0.2 & 0.5 & 0.1 \\ 0.1 & 0.2 & 0.2 & 0.5 \\ 0.5 & 0.1 & 0.2 & 0.2 \end{pmatrix}$$
In the fourth process all transitions occur equiprobably from one state to any other.
Fig. 2. Charts (a)-(d) show the typical size of the elements of the error matrices $E_1 \ldots E_4$, which are the difference between the Markov matrices $M_1 \ldots M_4$ (generating sequences) as defined in the text and the Markov matrices $\hat{M}_1 \ldots \hat{M}_4$ (computed from the generated sequences) for $3 \ldots 9$ states and $33 \ldots 2673$-long sequences.
For $n = 4$ the corresponding Markov matrix is
$$M_4 = \begin{pmatrix} 0.25 & 0.25 & 0.25 & 0.25 \\ 0.25 & 0.25 & 0.25 & 0.25 \\ 0.25 & 0.25 & 0.25 & 0.25 \\ 0.25 & 0.25 & 0.25 & 0.25 \end{pmatrix}$$
Typical errors for $M_1 \ldots M_4$ with $n = 3 \ldots 9$ and sequence lengths of $33 \ldots 2763$ averaged over 30 simulations for each combination document a number of interesting relationships (see Fig. 2): The typical error gets smaller as a linear function of the string length in a double logarithmic scale. Typical error is also about ten times less for the quasideterministic processes characterized by $M_1$ and $M_2$. Also, the number of states almost does not affect the typical error for a given string length. This is more true for the more random processes characterized by $M_3$ and $M_4$.
Fig. 3 shows some simulated scanpaths generated by the matrices $M_1 \ldots M_4$ superimposed on a drawing.

Fig. 3. (a)-(d) Simulated scanpaths on seven likely fixation points generated by matrices $M_1 \ldots M_4$ superimposed on a line drawing adapted from a painting by Charpentier (figure provided by G. Tharp).

III. A STRING EDITING MODEL OF SEQUENCES OF VISUAL FIXATIONS

It is possible to define the distance between two strings of not necessarily the same length as the cost of editing one to get the other. Editing a string has three basic operations: substitution, deletion and insertion. One must first define the cost for each such operation. For example, let us say that all substitutions are assigned a cost of 2 and both deletions and insertions a cost of 1. To transform the string ACA to CADAC one has to insert a C at the beginning (1) and a C at the end (1) and substitute the C in the middle with a D (2), resulting in a total cost, thus a distance, of 4. It is, of course, also possible to have more complicated cost assignments like setting the cost of substituting a letter with a letter following or preceding it in the alphabet as 1, the cost of substituting a letter with a letter following or preceding it in the alphabet by two letters as 2 and so on. As the strings get longer, the ways of transforming them increase very fast and it becomes no longer trivial to find the transformation that costs least. Therefore, an algorithm based on a modified dynamic programming that guarantees to find the minimum distance between two strings was developed exactly for that purpose [14].
When we apply this algorithm with the cost of substitution as 1, of insertion as 2 and of deletion as 3 (these costs were found empirically) on the sequences of visual fixations depicted in Fig. 4, we get the distance d between Figs. 4(a) and (b) as 10, between Figs. 4(a) and (c) as 15, and between Figs. 4(b) and (c) as 25. Therefore, the sequences in Figs. 4(a) and (b) are more similar to each other than the ones in Figs. 4(a) and (c) or the ones in Figs. 4(b) and (c). This result confirms what one would deduce by a visual examination of the sequences of visual fixations alone. Thus this method appears to be useful to automate, objectify and quantify the similarity of sequences of visual fixations while looking at an object.

Fig. 4. Eye movements of a subject while viewing a simple drawing show the presence and absence of repetitive cyclic scanning of the image. Numbers show the order of the subject's visual fixations. The labeled circles were drawn after the experiments to group and label the visual fixations. A visual inspection shows the paths of scanning in figures (a) and (b) to be similar, while the path in (c) does not resemble the ones in either (a) or (b). The fixation sequences are characterized as strings of letters where each letter denotes a fixation in the region labeled with that letter. The distances between the strings are computed as a cost of editing one string to attain the other and are (a)-(b) 10, (a)-(c) 15, and (b)-(c) 25.
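For readers who want to reproduce such numbers, the minimum editing cost can be computed with the standard dynamic program in the spirit of Wagner and Fischer [13]; the following Python sketch is ours (function and parameter names are assumptions, not the authors' code), with the operation costs passed in explicitly so that both the worked ACA/CADAC example and the costs used for Fig. 4 can be tried:

def edit_distance(s, t, sub_cost=1, ins_cost=2, del_cost=3):
    """Minimum cost of editing string s into string t (Wagner-Fischer dynamic program)."""
    n, m = len(s), len(t)
    # d[i][j] = cheapest way to turn the first i letters of s into the first j letters of t
    d = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        d[i][0] = i * del_cost
    for j in range(1, m + 1):
        d[0][j] = j * ins_cost
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d[i][j] = min(
                d[i - 1][j - 1] + (0 if s[i - 1] == t[j - 1] else sub_cost),
                d[i - 1][j] + del_cost,   # delete s[i-1]
                d[i][j - 1] + ins_cost,   # insert t[j-1]
            )
    return d[n][m]

# Worked example from Section III: substitutions cost 2, insertions and deletions cost 1,
# so editing ACA into CADAC costs 4.
print(edit_distance("ACA", "CADAC", sub_cost=2, ins_cost=1, del_cost=1))

With sub_cost=1, ins_cost=2 and del_cost=3 the same function can be applied to the letter strings read off Fig. 4.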
IV. STRING EDITING MEASURES OF SIMILARITY OF SEQUENCES
Another question of interest that now arises in the light of the past two sections is the following: What are the string editing measures between different realizations of the same Markov process, or in other words, what can we say about $d$, the distances between sequences generated by the same Markov matrix?
A simulation study was conducted, where for each of the Markov matrix structures $M_1 \ldots M_4$ and each number of states $3 \ldots 9$, 300 sequences of the length 33 (typical number of fixations during a viewing period of 10 s) were generated. Subsequently, the distances between these 300 sequences were measured with costs of substitution, deletion and insertion being chosen as unity.
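One combination of this simulation can be sketched as follows; it reuses the edit_distance function from the previous sketch, and the generator function, the uniform choice of the initial state and the letter encoding are our assumptions rather than details given in the paper:

import itertools
import random

def generate_sequence(matrix, length=33, rng=random):
    """Generate one realization of a Markov chain; states are encoded as letters A, B, ..."""
    states = list(range(len(matrix)))
    state = rng.choice(states)          # initial state chosen uniformly (assumption)
    seq = [state]
    for _ in range(length - 1):
        state = rng.choices(states, weights=matrix[state])[0]
        seq.append(state)
    return "".join(chr(ord("A") + s) for s in seq)

# M1: go to the next state with 0.9, to the one after the next with 0.1 (n = 4)
M1 = [[0.0, 0.9, 0.1, 0.0],
      [0.0, 0.0, 0.9, 0.1],
      [0.1, 0.0, 0.0, 0.9],
      [0.9, 0.1, 0.0, 0.0]]

sequences = [generate_sequence(M1) for _ in range(300)]
# Unit-cost pairwise distances; the brute-force pass over all pairs takes a minute or two.
distances = [edit_distance(a, b, sub_cost=1, ins_cost=1, del_cost=1)
             for a, b in itertools.combinations(sequences, 2)]
print(sum(distances) / len(distances))   # mean pairwise distance for this combination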
As Fig. 5 shows, the mean distance increases with increasing size for all matrix structures (less so for quasideterministic processes). This was to be expected, because with increasing matrix size, that is, with the introduction of new states, the probability of a different sequence being generated increases. The lowest curves in Fig. 5 belong to
$M_1$ and $M_2$. In other words, the most similar sequences are generated with rigidly structured Markov processes that are very close to a deterministic process, in which case the transition matrix would have only ones in the superdiagonal (Jordan form) and the sequences generated by such matrices would have a distance equal to zero from each other. Clearly, $M_1$ is closer to a deterministic process than $M_2$. Therefore, the mean distance between the sequences generated by $M_1$ is smaller than the ones generated by $M_2$. In the same vein, the process that is "most random," that is, with equal transition probabilities between each state, results in the largest distances between the resulting strings. Fig. 6 shows how the distribution of distances varies with the number of states for the four generating matrix structures.
Fig. 5. A Markov process generates sequences of a given length. The mean distance between such sequences of the length 33 as obtained from 300 simulations for each combination is shown as a function of the number of states and the corresponding matrix structure.

V. DISCUSSION

Scanpath theory predicts similar sequences of visual fixations for a subject looking at a particular image. The presence of similarity is mostly determined by a visual inspection of the fixation sequences. The use of the string editing method can help with the automation of determining this similarity by reducing a sequence of visual fixations to strings and by determining the distance between the strings. But in order to increase the statement power of this tool, we also had to find out about the statistics of randomly generated strings. This way, it can be stated with a certain confidence that the distance between two given sequences of visual fixations is below a threshold due to their similarity and not simply by coincidence.
An important problem inherent to this approach is the arbitrariness involved in the clustering and regionalization of fixations. For instance, why did we choose to include the fixation number 13 in Fig. 1 in the nose region? We could just as well have chosen to include it in the mouth region by changing the threshold between the mouth and the nose. Also, why didn't we subdivide the hand into fingers and palm and include fixation 7 in the palm area and fixations 5, 6, and 8 in the fingers area? It might be interesting to apply clustering algorithms as they are used in cosmology or elsewhere in image processing to decide to which group of points a certain fixation belongs [2]. But even such algorithms have several free parameters that the user must set. It is, of course, possible to divide the image into a regular grid but this loses any reference to the contents of the image.
Another problem involves the arbitrariness in choosing the costs of different editing operations in determining the distance between two strings. That is, while studying the similarity of the sequences of fixations in Fig. 4, why did we choose the costs of substitution, insertion and deletion as 1, 2, and 3 respectively? We could have assigned to them equal values as in the simulation study of the previous section. Also, assuming in a further, hypothetical example that a certain region D is close to another region E but far away from region F, we could make the costs string element dependent, such that the distance between the sequences ABCD and ABCE is less than the distance between ABCD and ABCF.
Features that are frequently utilized in active looking may have resulted in the evolution of internal mechanisms or feature detecting processes that have been embedded into "firmware" in the visual brain. When people look at an image, they look mainly at the parts of the picture that are regarded as being its features; they are the parts that hold the most information about the image. When subjects view simple pictures, their fixations tend to cluster around the points of the image where line directions change most abruptly [1]. Therefore, one can hypothesize that the angles are the principal features the brain employs to store and recognize images. There is further evidence encouraging that conclusion, namely the existence of angle-detecting neurons in the frog's retina [6], and complex cells in cats and monkeys [4]. This also makes sense from a memory space optimization point of view (data compression): if the object is divided into straight segments connected with each other, it is more reasonable to store the length of a segment and the angle that connects it to the next segment rather than storing the continuation of the segment at predetermined intervals. This is analogous to the storage of large sparse matrices (as encountered, for example, in power network problems) where the positions of the elements different from zero and their values are stored instead of storing, say, 10 000 elements of which only about one percent are nonzero.
Understanding how humans recognize objects can also be transferred to machine vision to result in top-down image processing methods [12]. Furthermore, a simplistic analogy of the feature ring theory can be applied for the recognition of (convex or concave) polygons [9]. First, the image must be preprocessed to enhance contrast and to detect edges. Then, the image is recoded in terms of corners and lengths of straight edges between corners (for curved edges, discretization can be employed to introduce corners). Corners can, for instance, be labeled with upper case letters and lengths with lower case letters or other special characters. It makes sense to normalize the lengths by the first length. This way, the string will be characteristic of the object independent of its translational or rotational position or distance from the camera. (Of course, this string will initially be dependent on the first corner that the system recognizes, but this can be taken care of by a shift operator.) Once the image is compressed to a string representation, it can be compared with objects known to the system as described previously. The system will recognize the object under study as the object that has the shortest distance in the string editing sense from known objects. If the minimal distance is larger than a predefined threshold, the system can be programmed to learn and store it as a new object in its library. This approach of minimizing distances makes the method robust with respect to noise and errors in contrast enhancement and edge detection.
In summary, sequences of visual fixations while looking at an object were modeled as Markov processes. A method of quantifying the similarity of eye movements while looking at an object was introduced. This method is based on reducing the sequence of visual fixations to a sequence of letters and defining an editing cost as the distance between such strings. Advantages and shortcomings of this method were discussed together with the possibility of its application in machine vision for the robust recognition of objects. Results of an experimental study of visual fixations with the string editing algorithm will be presented in a future paper.

Fig. 6. The distribution of distances between sequences generated by the same Markov matrix gets more evenly and widely spread around the mean distance as the randomness of the process increases from (a) M1 to (d) M4.
ACKNOWLEDGMENT
The authors thank Greg Tharp for providing Fig. 3.
REFERENCES
[1] F. Attneave, "Some informational aspects of visual perception," Psychol. Rev., vol. 61, pp. 183-193, 1954.
[2] J. C. Bezdek, Pattern Recognition with Fuzzy Objective Function Algorithms. New York: Plenum, 1981.
[3] M. Eigen, R. Winkler-Oswatitisch, and A. Dress, "Statistical geometry in sequence space: A method of quantitative comparative sequence analysis," Proc. Nat. Acad. Sci., vol. 85, pp. 5913-5917, 1988.
[4] D. H. Hubel and T. N. Wiesel, "Receptive fields, binocular interaction and functional architecture in the cat's visual cortex," J. Physiol., vol. 160, pp. 106-154, 1962.
[5] J. G. Kemeny and J. L. Snell, Finite Markov Chains. New York: Springer, 1983.
[6] J. Y. Lettvin, H. R. Maturana, W. S. McCulloch, and W. H. Pitts, "What the frog's eye tells the frog's brain," in Proc. IRE, vol. 47, pp. 1940-1951, 1959.
[7] D. Noton and L. W. Stark, "Scanpaths in eye movements during pattern perception," Science, vol. 171, pp. 308-311, 1971.
[8] D. Noton and L. W. Stark, "Eye movements and visual perception," Scientific Amer., vol. 221, no. 6, pp. 34-43, 1971.
[9] R. Schubiger, J. Moser, S. S. Hacisalihzade, and M. A. Muller, "Machine vision based on human perception and eye movements," to be presented at the IEEE Eng. Medicine Biology Soc. 13th Annu. Int. Conf., Orlando, FL, Nov. 1991.
[10] L. W. Stark, J. A. Michael, and B. L. Zuber, "Saccadic suppression: A product of the saccadic anticipatory signal," in Attention in Neurophysiology, C. R. Evans and T. B. Mulholland, Eds. London: Butterworths, 1969.
[11] L. W. Stark and S. R. Ellis, "Scanpaths revisited: Cognitive models, direct active looking," in Eye Movements: Cognition and Visual Perception, D. F. Fisher, R. A. Monty, and J. W. Senders, Eds. Hillsdale, NJ: Lawrence Erlbaum, 1981.
[12] L. W. Stark, B. Mills, A. H. Nguyen, and H. X. Ngo, "Instrumentation and robotic image processing using top-down model control," in Robotics and Manufacturing, M. Jamshidi et al., Eds. New York: ASME Press, 1988.
[13] R. A. Wagner and M. J. Fischer, "The string-to-string correction problem," J. ACM, vol. 21, pp. 168-173, 1974.
[14] R. A. Wagner, "On the complexity of the extended string-to-string correction problem," in Time Warps, String Edits, and Macromolecules: The Theory and Practice of Sequence Comparison, D. Sankoff and J. B. Kruskal, Eds. Reading, MA: Addison-Wesley, 1983.
[15] B. L. Zuber, Models of Oculomotor Behavior and Control. Boca Raton, FL: CRC Press, 1981.
Lawrence Stark (SM'61-F'70) is a Professor at the University of California, Berkeley, where he divides his teaching efforts between the EECS and ME Departments in engineering and between the Physiological Optics and Neurology Departments in biology and medicine. His research interests are in bioengineering, with emphasis on human and robotic control of movement and vision. He pioneered the application of control and information theory to neurological systems.

Selim Hacisalihzade (S'81-M'81-SM'90) was born in Istanbul, Turkey, in 1957. He received the diploma of electrical engineering, the postdiploma in automatic control, and the doctorate in electrical engineering, all from the Swiss Federal Institute of Technology (ETH), Zurich, Switzerland, in 1980, 1983, and 1986, respectively. He was a Research Associate at the University of California, Berkeley (1987-1989), and an NRC Fellow at NASA Ames Research Center, California (1988-1989). He is currently head of Technology Observation and European Community (EC) R&D Projects at Landis & Gyr and a lecturer at the Swiss Federal Institute of Technology (ETH). Dr. Hacisalihzade is the founder and the current Chairman of the IEEE Engineering in Medicine and Biology Society Chapter in Austria/Germany/Switzerland. He is the author or coauthor of more than 50 papers in the general field of automatic control applications in life sciences.

John Allen was born in 1961 in Iowa City, IA. He received the B.A. degree in molecular biology and anthropology in 1983, and the Ph.D. degree in biological anthropology in 1989, from the University of California, Berkeley. His research interests include human behavioral evolution, the use of biological markers in the cross-cultural study of behavioral diseases (e.g., eye movement dysfunction and schizophrenia), and the history of anthropology. He has published articles in Perspectives in Biology and Medicine, Ergonomics, Human Biology, Current Anthropology, and Biological Psychiatry. After completing a postdoctoral fellowship in the Stanford University School of Medicine, Department of Psychiatry and Behavioral Sciences, he has recently been appointed a Lecturer in Biological Anthropology at the University of Auckland, New Zealand.

View File

@@ -0,0 +1,15 @@
Title: Visual perception and sequences of eye movement fixations: a stochastic modeling approach
Subject: IEEE Transactions on Systems, Man, and Cybernetics;1992;22;3;10.1109/21.155948
Author: S.S. Hacisalihzade;L.W. Stark;J.S. Allen
Creator: Acrobat Capture 3.0
Producer: Adobe PDF Library 4.0; modified using iText® 7.1.1 ©2000-2018 iText Group NV (AGPL-version)
CreationDate: 04/25/04 02:23:53
ModDate: 10/27/18 23:48:06
Tagged: no
Form: none
Pages: 8
Encrypted: no
Page size: 620.16 x 798.119 pts (rotated 0 degrees)
File size: 1311563 bytes
Optimized: no
PDF version: 1.3

View File

@@ -0,0 +1,237 @@
Revisiting Visual Attention Identification Based on Eye Tracking Data Analytics
Yingxue Zhang 1, Zhenzhong Chen 2
School of Remote Sensing and Information Engineering, Wuhan University Wuhan, Hubei, China 1 grace@whu.edu.cn 2 zzchen@whu.edu.cn
Abstract—Visual attention identification is crucial to human visual perception analysis and relevant applications. In this paper, we propose a comprehensive visual attention identification algorithm consisting of clustering and center identification. In the clustering process, a spatial-temporal affinity propagation method for accurate fixation clustering is proposed. For the identified clusters, the random walk based method is utilized to extract the center of each cluster, which presents the essential part of an area of interest (AOI). The proposed approach addresses the problem of fixation overlapping in eye movement analytics. Compared with state-of-the-art methods, the proposed method shows superior performance for different eye tracking experiments.
Index Terms—Visual attention; eye tracking; clustering; affinity propagation; random walk
I. INTRODUCTION
Eye movements on visual targets attach much importance to cognition-related research [1]. The eye tracking data can quantitatively reflect the visual perception behaviors. In related domains, attention is typically paid to eye movements in terms of fixations and saccades.
To apply the raw data to further analysis, we should at first figure out the fixations and saccades. Thus the classification and clustering process should be implemented. Many articles have illustrated applicable algorithms for classification and clustering, such as Velocity-Threshold Identification (I-VT), Hidden Markov Model fixation Identification (I-HMM) [2], Dispersion-Threshold Identification (I-DT) [3], K-means [4], projection techniques and density-based combing clustering [5], and agglomerative hierarchical clustering [6]. The interpretation of eye movements varies greatly when different algorithms or parameter settings are applied.
The fixation centers make a crucial indicator of how the subjects comprehend different objects. While viewing a target, the subjects tend to be attracted by salient objects that can be simplified as centers of AOIs, so that the fixations mostly gather around certain centers [7]. With fixation centers identified, further applications that need accurate visual focus, such as eye tracking assisted human-computer interaction, virtual
reality, can be guaranteed. In most situations, the mean based methods are applied to generate centers of visual attention [8].
However, there are still some problems existing in the visual attention identification process. Most clustering algorithms adopted in the current eye tracking systems are conducted merely in single dimension, leading to either redundant results or lack of practical meaning. Moreover, for visual attention center identification, the widely-used mean based methods ignore the inner spatial relation among fixations by taking each point equally and are sensitive to noises. Under this circumstance, the fixation cluster overlapping and center deviation are critical problems in eye movement analytics.
Considering the deficiency, we propose a visual attention identification algorithm based on the combination of spatial-temporal affinity propagation clustering and random walk. The algorithm takes account of varieties of attributes in eye movement data, such as distance, duration and density, to handle the problem and proves improved performance in the experiments.
The paper is arranged as follows. Section II illustrates our comprehensive analytics of eye tracking data for visual attention identification. Section III shows the experiments. Section IV makes a conclusion for the paper.
II. VISUAL ATTENTION IDENTIFICATION
Given a set of raw eye tracking data, the I-VT algorithm is implemented firstly to remove saccades and obtain the initial fixation clusters separated by saccades. Then random walk is conducted on each cluster to generate initial centers which will be clustered with the affinity propagation. The extra clustering process ensures the centers around the same AOI being merged. The initial clusters will also be merged to form the final clusters accordingly using affinity propagation. For each final cluster, we perform random walk based method again to identify final centers of AOIs. The overall work flow is shown in Fig. 1.
A. Clustering Eye Tracking Data
In this work, the spatial-temporal clustering is conducted on the basis of I-VT and affinity propagation. I-VT is widely used in the eye tracking data classification and clustering process for its simplicity. However, since only velocity is considered,
Fig. 1. The work flow of the proposed method.
the algorithm faces the problem of cluster overlapping in most cases, which is the primary problem we aim to solve.
When the subjects focus on a stimulus, groups of points will be recorded, which may include some spatially overlapped groups. If the overlapped groups are close enough, they are considered to belong to one AOI being reviewed several times. But I-VT can not figure out the relationship for it merely gathers temporally consecutive fixations separated by velocity. In this case, there will be redundant clusters around one AOI, which makes an interference to further analysis.
Concerning the problem, we combine affinity propagation [9] with I-VT. Affinity propagation is a robust algorithm in the spatial dimension. It takes spatial distance as similarity so that it ideally handles the problem of overlapping in I-VT.
1) Classification and Initial Clustering: Since the saccades move at a much higher velocity than fixations, I-VT separates the fixations from saccades, discards the saccades and gathers consecutive fixations into clusters with a velocity threshold [2]. By this means, we obtain initial fixation clusters reflecting the moving trajectory of visual attention. The velocity of an eye movement point can be equivalent to the Euclidean distance between the point and the next in calculation on account of a constant recording rate. The velocity threshold is set to 20 for a better final consequence.
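As a rough illustration of this step (not the authors' code; the function name, the point format and the handling of saccade samples are assumptions), I-VT clustering with the point-to-point distance standing in for velocity can be sketched as:

import math

def ivt_clusters(points, velocity_threshold=20.0):
    """Split a gaze trace into initial fixation clusters with I-VT.

    points: list of (x, y) gaze samples recorded at a constant rate, so the
    Euclidean distance to the next sample can stand in for velocity.
    Samples whose distance to the next sample exceeds the threshold are
    treated as saccade samples and discarded; runs of consecutive fixation
    samples form the initial clusters. The last sample has no successor and
    is simply dropped in this sketch.
    """
    clusters, current = [], []
    for (x0, y0), (x1, y1) in zip(points, points[1:]):
        if math.hypot(x1 - x0, y1 - y0) <= velocity_threshold:
            current.append((x0, y0))
        elif current:
            clusters.append(current)
            current = []
    if current:
        clusters.append(current)
    return clusters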
2) Initial Center Identification with Random Walks: After I-VT process, initial clusters are generated. Provided that the revisit condition exists, there should be extra clusters that must be merged. The merging criteria is the distance between two clusters. To appropriately calculate the distance, we implement random walk on each initial cluster to identify a center which can best represent the cluster in distance calculation. Since the random walk based method utilized here is the same as the final center identification process, the specific algorithm details will be introduced in the center identification section.
3) Final Clustering Using Affinity Propagation: Obtaining the centers representing the initial clusters, we conduct affinity propagation on the centers in order to merge the clusters belonging to the same AOI. We use the centers to participate in the final clustering for the centers identified by random walk can best represent the spatial position of the clusters.
• Establishing similarity matrix: The similarity among points is the basis of spatial clustering. A similarity matrix among all the initial centers is established for clustering. The similarity $s(i,j)$ is defined using the negative Euclidean distance between point $i$ and $j$. To generate a moderate cluster number, we set the preference, i.e. self-similarity, to half of the median of all the similarities.
• Message propagating: Two kinds of messages representing the affinity, i.e. "responsibility" and "availability", are defined and recursively propagated till refined clusters emerge (a code sketch of this message passing follows the list). The availability is initialized to zero while the responsibility is initialized and updated as:
$$r(i,k) = s(i,k) - \max_{k' \neq k} \{ s(i,k') \}, \qquad (1)$$
where $k'$ means the other candidate centers except $k$. The availability is updated by:
$$a(i,k) = \min \Big\{ 0,\; r(k,k) + \sum_{i' \notin \{i,k\}} \max \{ 0, r(i',k) \} \Big\}, \qquad (2)$$
where $i'$ means the other candidate centers except $i$ and $k$. Specially, the self-availability is updated differently as:
$$a(k,k) = \sum_{i' \neq k} \max \{ 0, r(i',k) \}. \qquad (3)$$
• Damping factor: To avoid numerical oscillations arising in unexpected circumstance, we add a damping factor to the messages in every iteration:
$$r(i,k) = (1-\lambda)\, r(i,k) + \lambda\, r_{old}(i,k), \qquad (4)$$
$$a(i,k) = (1-\lambda)\, a(i,k) + \lambda\, a_{old}(i,k), \qquad (5)$$
where $\lambda$ is the damping factor between 0 and 1. We set $\lambda$ to 0.9 in this paper. $r_{old}(i,k)$ and $a_{old}(i,k)$ are the messages in the previous iteration.
• Identifying final clusters: When the message propagation accomplishes, the convergent matrices of $r$ and $a$ are added together to form an evidence matrix $E$. We extract the diagonal elements of $E$ to determine the clustering result. The point $k$ whose corresponding $E(k,k) > 0$ will be chosen as an exemplar. Meanwhile, non-exemplar points will be assigned to the cluster centralized with the exemplar that has the largest similarity with them.
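The following numpy sketch of this message passing is our illustration, not the authors' implementation; function and variable names are assumptions. The responsibility line uses the usual affinity-propagation form with a(i,k') + s(i,k') inside the max, which coincides with eq. (1) at the first iteration while the availabilities are still zero:

import numpy as np

def affinity_propagation(points, damping=0.9, iterations=200):
    """Cluster points (e.g. the initial cluster centers) by affinity propagation."""
    pts = np.asarray(points, dtype=float)
    n = len(pts)
    # Similarity: negative Euclidean distance; preference (self-similarity) is
    # half of the median of all pairwise similarities, as in the text.
    s = -np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
    off_diag = ~np.eye(n, dtype=bool)
    np.fill_diagonal(s, 0.5 * np.median(s[off_diag]))

    r = np.zeros((n, n))   # responsibilities
    a = np.zeros((n, n))   # availabilities
    for _ in range(iterations):
        # Responsibility: r(i,k) = s(i,k) - max_{k' != k} (a(i,k') + s(i,k')), damped as in eq. (4).
        as_ = a + s
        idx = np.argmax(as_, axis=1)
        first_max = as_[np.arange(n), idx]
        as_[np.arange(n), idx] = -np.inf
        second_max = np.max(as_, axis=1)
        r_new = s - first_max[:, None]
        r_new[np.arange(n), idx] = s[np.arange(n), idx] - second_max
        r = damping * r + (1 - damping) * r_new
        # Availability: eqs. (2) and (3), damped as in eq. (5).
        rp = np.maximum(r, 0)
        np.fill_diagonal(rp, np.diag(r))
        a_new = rp.sum(axis=0)[None, :] - rp
        diag_a = np.diag(a_new).copy()
        a_new = np.minimum(a_new, 0)
        np.fill_diagonal(a_new, diag_a)
        a = damping * a + (1 - damping) * a_new
    # Evidence matrix E = r + a; points with a positive diagonal entry become exemplars.
    e = r + a
    exemplars = np.flatnonzero(np.diag(e) > 0)
    if len(exemplars) == 0:                      # guard for degenerate inputs
        exemplars = np.array([int(np.argmax(np.diag(e)))])
    labels = exemplars[np.argmax(s[:, exemplars], axis=1)]
    labels[exemplars] = exemplars
    return labels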
While the centers belonging to the same AOI are clustered into one group, the initial fixation clusters represented by the centers will also be merged correspondingly.
B. Identifying Visual Attention Centers
From the clustering process, eye fixations are divided into clusters around different AOIs. On each fixation cluster, we conduct random walk again to figure out its final center, which represents the visual focus. The identification is premised on the assumption that a center is surrounded by a large percentage of fixations of high consistency with each other [10]. Random walk assigns a coefficient to each fixation according to its potential approximation and transmits it to the neighbors given a high consistency. Compared with mean or density based methods in single dimension, random walk combines both spatial and temporal cues to locate the final centers.
Fig. 2. Experimental results on patterns. The first row is the result of initial clustering with I-VT. The second row is the final result of our method. Fixations of different clusters are marked out with asterisks of different colors. The centers are represented with red dots. The grey crosses show the ground truth.
• Defining transition probability: The transition probability $q(i,j)$ from point $i$ to $j$ is calculated using the equation:
$$q(i,j) = \frac{e^{-\sigma \times D(i,j)}}{\sum_{k=1}^{n} e^{-\sigma \times D(i,k)}}, \qquad (6)$$
where $D(i,j)$ is the Euclidean distance from $i$ to $j$. $\sigma$ makes a subtle adjustment to the distribution of centers and the denominator normalizes the probability. $\sigma$ is set to 0.08 here. The transition probability reflects the approximate probability between every two fixations. A farther distance leads to a smaller approximate probability and vice versa.
• Integrating fixation density: For each fixation, its coefficient is initialized using the density of relevant fixations, which is integrated on the basis of tracking duration. The coefficient is obtained by normalizing the density.
• Updating coefficients with random walk: Random walk recursively updates the coefficients using the transition probability of fixations. To reduce the input errors, a damping factor is added to the process:
$$l_{t+1}(i) = \frac{1}{\eta} \Big( \sum_{j=1}^{n} \big(1 - (1-\alpha)\, l_t(i)\big)\, l_t(j)\, q(j,i) + (1-\alpha)\, l_t(i)\, w(i) \Big), \qquad (7)$$
where $l_t(i)$ means the coefficient of fixation $i$ in iteration $t$. The damping factor is expressed as $(1-\alpha)\, l_t(i)\, w(i)$. $\alpha$ is set to 0.5. $\eta$ is the parameter that normalizes the coefficient:
$$\eta = \sum_{i=1}^{n} \Big( \sum_{j=1}^{n} \big(1 - (1-\alpha)\, l_t(i)\big)\, l_t(j)\, q(j,i) + (1-\alpha)\, l_t(i)\, w(i) \Big). \qquad (8)$$
Iteration terminates on reaching convergence, that is, when the coefficient $l_{t+1}$ is equal to that of the previous iteration.
• Identifying fixation centers: Finally, we obtain the center $(\hat{x}, \hat{y})$ of a certain fixation cluster by calculating the mean fixation position weighted by the final coefficient $l_T$, which is obtained from the updating process:
$$\hat{x} = \sum_{i=1}^{n} x_i\, l_T(i), \qquad \hat{y} = \sum_{i=1}^{n} y_i\, l_T(i). \qquad (9)$$
(A code sketch of this random-walk update is given after Table I below.)

Fig. 3. The magnified partial view of the result in Fig. 2. The identified centers are marked out with red dot (our method), green triangle (Tobii's default method), black diamond (K-means) and magenta dot ([11]).

III. EXPERIMENTS

A. Experiment Setup
We collect the eye tracking data with a Tobii X120 Eye Tracker. The tracker is at a distance of 1 meter from the subject, tilted at 30 degrees and placed in front of a 27-inch computer monitor that presents the stimuli. Each stimulus is viewed for about 10 seconds. The results, including coordinate, duration and recording moment of each eye movement point, are recorded at 120 Hz. To comprehensively verify the method, we set two experimental scenes and also compare the consequences with some existing algorithms.

B. Experiments on Different Patterns
Three patterns are used for validation, in which the centers are marked out as ground truth. The subject is asked to fix attention on the centers of the patterns. As is shown in Fig. 2, the proposed algorithm avoids the interference of the cluster overlapping and obtains reasonable results.

TABLE I
COMPARISON OF ABSOLUTE PIXEL DEVIATION OF DIFFERENT METHODS

Method                   Pixel deviation
K-means based method     13.0822
Tobii's default method   10.8432
Špakov's method [11]     12.6322
Our method                8.4430
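Following up on eqs. (6)-(9) in Section II-B, here is a compact numpy sketch of the random-walk center update; it is our illustration rather than the authors' code, the decaying exponential in the transition probability is our reading of the sign convention in eq. (6), and sigma = 0.08 and alpha = 0.5 follow the text:

import numpy as np

def random_walk_center(xy, durations, sigma=0.08, alpha=0.5, tol=1e-9, max_iter=500):
    """Locate the attention center of one fixation cluster via the random-walk update."""
    xy = np.asarray(xy, dtype=float)
    d = np.linalg.norm(xy[:, None, :] - xy[None, :, :], axis=-1)
    q = np.exp(-sigma * d)
    q /= q.sum(axis=1, keepdims=True)            # eq. (6): each row sums to 1
    w = np.asarray(durations, dtype=float)
    w /= w.sum()                                  # normalized density-based weights
    coef = w.copy()                               # initial coefficients from fixation density
    for _ in range(max_iter):
        # eq. (7): spread coefficients along the transition probabilities ...
        spread = (1 - (1 - alpha) * coef) * (q.T @ coef)
        restart = (1 - alpha) * coef * w          # ... plus the damping term
        new = spread + restart
        new /= new.sum()                          # eq. (8): eta normalizes the update
        if np.max(np.abs(new - coef)) < tol:      # convergence criterion from the text
            coef = new
            break
        coef = new
    # eq. (9): mean fixation position weighted by the final coefficients
    return (xy * coef[:, None]).sum(axis=0)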
Fig. 4. Experiment on natural images. The results of different methods are shown in corresponding columns.
We also conduct other methods including K-means based method, density based method in [11], and Tobii's default method utilized in Tobii eye tracker software. In Fig. 3, the results of these methods on a single center are shown to illustrate the advantage of our method. We can see that our algorithm locates the center with the smallest deviation. For Tobii's method, two mistaken centers are marked out on account of merely considering temporal factor. The results of K-means and density based methods distinctly deviate from the ground truth. Table I lists the absolute pixel deviation from the ground truth in the partial view, which quantitatively shows the advantage of the proposed method.
C. Experiments on Natural Images
To validate the proposed method in practical eye-tracking applications, we implement the algorithms on natural images from [12]. We can see from Fig. 4 that the proposed method successfully handles the messy overlapping fixations and identifies one center on each AOI, while Tobii's default method faces failure of generating too many unnecessary centers. In comparison, the results of density based method and K-means based method are similar to ours. However, the density based method is easily influenced by the outliers around the AOIs and deviates towards the outliers. K-means based method is quite sensitive to the initialization so that it has to be rerun at least three times to get an acceptable result and the number of clusters has to be set manually for each image.
IV. CONCLUSION
In this paper, we present a comprehensive algorithm of visual attention identification, which is based on clustering and visual attention center identification. On the eye tracking dataset, fixation clusters are generated with our proposed spatial-temporal affinity propagation clustering method. On each cluster, random walk based method is conducted to identify the corresponding center. The algorithm solves the problem of overlapping in eye movement analytics and achieves a more accurate center identification result. In the experiments, we verify the effective performance of our algorithm with
two experiments. In comparison with other methods, the
proposed visual attention identification algorithm outperforms
in accuracy and robustness.
ACKNOWLEDGMENT
This work was supported in part by National Natural Sci-
ence Foundation of China (No. 61471273), National Hightech
R&D Program of China (863 Program, 2015AA015903), and
Natural Science Foundation of Hubei Province of China (No.
2015CFA053).
REFERENCES
[1] L. Mason, P. Pluchino, and M. C. Tornatora, “Eye-movement modeling of integrative reading of an illustrated text: Effects on processing and learning,” Contemporary Educational Psychology, vol. 41, pp. 172 187, 2015.
[2] C. J. Erkelens and I. M. Vogels, “The initial direction and landing position of saccades,” Studies in Visual Information Processing, vol. 6, pp. 133144, 1995.
[3] D. D. Salvucci and J. H. Goldberg, “Identifying fixations and saccades in eye-tracking protocols,” in Proceedings of the 2000 Symposium on Eye Tracking Research & Applications. ACM, 2000, pp. 7178.
[4] A. Likas, N. Vlassis, and J. J. Verbeek, “The global k-means clustering algorithm,” Pattern Recognition, vol. 36, pp. 451461, 2003.
[5] T. Urruty, S. Lew, C. Djeraba, and D. A. Simovici, “Detecting eye fixations by projection clustering,” ACM Transactions on Multimedia Computing Communications and Applications, vol. 3, pp. 120, 2007.
[6] A. Bouguettaya, Q. Yu, X. Liu, X. Zhou, and A. Song, “Efficient agglomerative hierarchical clustering,” Expert Systems with Applications, vol. 42, pp. 27852797, 2015.
[7] C. Privitera and L. Stark, “Algorithms for defining visual regions-ofinterest: comparison with eye fixations,” IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 22, no. 9, pp. 970982, 2000.
[8] S. D. Kro¨nig and E. A. Buffalo, “A nonparametric method for detecting fixations and saccades using cluster analysis: Removing the need for arbitrary thresholds,” Journal of Neuroscience Methods, vol. 227, pp. 121131, 2014.
[9] B. J. Frey and D. Dueck, “Clustering by passing messages between data points,” Science, vol. 315, pp. 972976, 2007.
[10] A. R. Zamir, S. Ardeshir, and M. Shah, “Gps-tag refinement using random walks with an adaptive damping factor,” in IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 2014, pp. 42804287.
[11] O. Sˇ pakov and D. Miniotas, “Application of clustering algorithms in eye gaze visualizations,” Information Technology and Control, vol. 36, no. 2, pp. 213216, 2007.
[12] T. Judd, K. Ehinger, F. Durand, and A. Torralba, “Learning to predict where humans look,” in IEEE 12th International Conference on Computer Vision, 2009, pp. 21062113.

View File

@@ -0,0 +1,15 @@
Title: Revisiting visual attention identification based on eye tracking data analytics
Subject: 2016 Visual Communications and Image Processing (VCIP);2016; ; ;10.1109/VCIP.2016.7805537
Author: Yingxue Zhang
Creator: 'Certified by IEEE PDFeXpress at 09/04/2016 8:47:13 PM'
Producer: MiKTeX-dvipdfmx (20110311); modified using iText® 7.1.1 ©2000-2018 iText Group NV (AGPL-version)
CreationDate: 09/05/16 11:12:59
ModDate: 12/07/18 07:25:46
Tagged: no
Form: none
Pages: 4
Encrypted: no
Page size: 595.28 x 841.89 pts (A4) (rotated 0 degrees)
File size: 1851420 bytes
Optimized: no
PDF version: 1.4

View File

@@ -0,0 +1,245 @@
INHALTSVERZEICHNIS
DIE HOCHSCHULE ZITTAU: GRUSSWORT UND REGIONAL SESSION.........................................................11
GRUSSWORT DES REKTORS DER HOCHSCHULE ZITTAU-GÖRLITZ Kratzsch, Alexander..............................................................................................................................................13
ZITTAU - IMMER EINE (KONFERENZ-) REISE WERT! Reiche, Karl-Heinz................................................................................................................................................15
DIE SICHT DER BEVÖLKERUNG AUF DIE ZUKUNFT DER LAUSITZ Klemm, Viktoria.....................................................................................................................................................17
DIE BEDARFE AN BERUFLICHEN WEITERBILDUNGEN IN DER REGION OBERLAUSITZ Winkler, Daniel; Keil, Sophia; Lindner, Fabian; Mühlan, Kevin; Przybysz, Kazimierz Adam.................. 21
KEYNOTES..............................................................................................................................................................27
AKKREDITIERUNG 2021 - AKTUELLE FRAGEN UND PROBLEME Bargstädt, Hans-Joachim; Grünes, Andreas................................................................................................... 29
WOHIN ÄNDERT SICH DIE ARBEITSWELT? Hirsch-Kreinsen, Hartmut...................................................................................................................................39
„FLIPPED CLASSROOM" IN MATERIALWISSENSCHAFTEN - DER DIDAKTISCHE MEHRWERT?
PRAXISERFAHRUNGEN
Pfennig, Anja
47
EINFÜHRENDE REFERATE...............................................................................................................................57
ZUM „PURPOSE" DER INGENIEURAUSBILDUNG; PURPOSE ALS HOCHSCHULDIDAKTISCHE KATEGORIE -WELCHE CHANCEN UND RISIKEN SIND FÜR INGENIEURSTUDIENGÄNGE ABLEITBAR?
Dreher, Ralph
59
HOW TO MAKE AN ENGINEER - NACHWUCHSARBEIT ZWISCHEN MARKT UND MISSION Fislake, Martin........................................................................................................................................................ 69
KONSTRUKTIVISTISCHE FACHDIDAKTIK DER ELEKTROTECHNIK - INHALTLICHE ASPEKTE Jambor, Thomas N................................................................................................................................................79
INGENIEURPÄDAGOGIK IM KONTEXT AKADEMISCHER UND BERUFLICHER BILDUNG Kersten, Steffen....................................................................................................................................................89
KÜNSTLICHE INTELLIGENZ UND ETHIK........................................................................................................ 99
ALGORITHMENETHISCHE FRAGESTELLUNGEN IM BILDUNGSKONTEXT Antony, Lea.......................................................................................................................................................... 101
KÜNSTLICHE INTELLIGENZ UND MORAL MISFORTUNE? ODER: WER ÜBERNIMMT DIE
VERANTWORTUNG FÜR MORALISCH ILLEGITIME OPERATIONEN EINES KI-SYSTEMS?
Greger, Timo
107
DER EINSATZ VON KI IN BILDUNGSPROZESSEN - HERAUSFORDERUNGEN UND CHANCEN AM
BEISPIEL INTERAKTIVER DIGITALER ZEUGNISSE
Heindl, Fabian
113
‚HEY SIRI, PAINT A PICASSO!' - KANN KI KUNST?
Winter, Dorothea..................................................................................................................................................119
WORKSHOPS ......................................................................................................................................................125
GENDER, HETEROGENITÄT UND KULTURELLE VIELFALT IN DER TECHNISCHEN BILDUNG:
DIFFERENZIEREN, INDIVIDUALISIEREN ODER INTEGRIEREN?
Dederichs-Koch, Andrea; Pieper, Justinus
127
ETHIK UND TECHNISCHES HANDELN - EINE VERHÄLTNISBESTIMMUNG ZWEIER ASPEKTE DES MENSCHLICHEN LEBENSVOLLZUGS Diebel-Fischer, Hermann....................................................................................................................................135
ASPEKTE UND KRITERIEN ERFOLGREICHER BINATIONALER ZUSAMMENARBEIT IM BEREICH TUTOR/INNENQUALIFIZIERUNG Hölscher, Meike; Weston, Anna; Mölken, Oliver; Bock, Silke; Rausenberger, Julia.............................. 141
BETRIEBLICHE KOMPETENZENTWICKLUNG IN DIGITALEN ZEITEN - (NEUER) REFLEXIONSBEDARF FÜR WEITERBILDENDE FACHKRÄFTE? Kukuk, Andre.........................................................................................................................................................149
REFERATE.............................................................................................................................................................157
I DER GROSSE SCHRITT INS STUDIUM.............................................................................................................159
STUDIENSTART ELEKTROTECHNIK IN COVID-19 ZEITEN Neukamp, Till; Betz, Thomas............................................................................................................................161
DIGITALE LEHRANGEBOTE MATHEMATIK IN DER SCHUL- UND STUDIENEINGANGSPHASE INGENIEURWISSENSCHAFTLICHER STUDIENGÄNGE Kreis, Oliver; Nasarow, Alexander..............................................................167
II BERUFLICHE BILDUNG / INGENIEURPÄDAGOGIK - KONZEPTE........................................................... 175
NEUE WEGE IN DER INGENIEURWISSENSCHAFTLICHEN BILDUNG AM BEISPIEL DER ELEKTROTECHNIK Block, Brit-Maren..............................................................177
DER (UNTERSCHÄTZTE) EINFLUSS EINER VERNETZTEN, INTERDISZIPLINÄREN LEHRE. EINE EMPIRISCHE STUDIE IN DEN INGENIEURWISSENSCHAFTEN Dumschat, Markus; Bjekovic, Robert; Stetter, Ralf; Mischo-Kelling, Maria; Rottmann, Joachim; Schweizer, Phileas................................................................183
FÖRDERUNG INTERKULTURELLER KOMPETENZEN IN DER „INGENIEURE OHNE GRENZEN CHALLENGE" Frye, Silke; May, Dominik; Haertel, Tobias..............................................................189
UNTERRICHTSKONZEPTE ZUR THEORIE-PRAXIS-VERBINDUNG IM BERUFSFELD ERNÄHRUNG Horlacher, Franz.................................................................................................................................................195
KOGNITIVE UNTERSTÜTZUNGSSYSTEME ZUR ERHÖHUNG DER HANDLUNGSSICHERHEIT IN KOMPLEXEN ANFORDERUNGSSITUATIONEN - EIN DISZIPLINÜBERGREIFENDER ANSATZ Köhler, Marcel..............................................................203
ENGINEERING FUTURE SKILLS: REFLEXIONSBASIERT PRÜFEN IN DEN INGENIEURWISSENSCHAFTEN Richert, Anja; Varney (geb. Stehling), Valerie; Thoma, Aniella.........................................................................211
BASISKONZEPTE UND FUNDAMENTALE IDEEN - EIN BLICK AUF DIE SITUATION IN DER ELEKTROTECHNIK Stender, Birga; Krugel, Johannes..............................................................217
III ONLINE-LEHRE - AUGMENTED, ASSISTED UND VIRTUAL REALITY....................................................225
AUGMENTED REALITY - KOMPETENTES AGIEREN IN BERUFLICHER BILDUNG UND PRAXIS Agus, Mattia; Jaschke, Steffen; Kuhnhen, Christopher; Langhammer, Kay; Menzel, Mareike; Riehle, Tamara; Schulte, Sven; Schuster, Peter; Wepner, Kim..............................................................227
VOM ZAUBER DES ANFANGS - DEN EINSTIEG IN ONLINE-LEHRVERANSTALTUNGEN BEWUSST GESTALTEN Berbuir, Ute; John, Magdalena; Wedler, Kevin..............................................................................................235
ERWACHSENENBILDUNG POTENTIAL-ANALYSEN BLENDED LEARNING APPROACH IM AGILEN ARBEITSUMFELD Burchardt, Carsten...............................................................................................................................243
PROGRAMMIEREN LERNEN - WEB-BASIERT UND OFFEN Burke, Bruno; Möller, Sebastian; Krings, Luise; Schinzel, Sebastian; Vennemann, Peter....................249
KOMPETENZEN IN DER INGENIEUR(AUS)BILDUNG IN EINER DIGITALEN WELT - VON DIVERGIERENDEN ERWARTUNGEN UND ANFORDERUNGEN ZU NEUGESTALTUNGSPERSPEKTIVEN Cwielong, Ilona Andrea..............................................................255
HYBRID-KLAUSUR - AUTOMATISIERTES PRÜFEN KOMPLEXER BERECHNUNGSAUFGABEN FÜR EINE EFFIZIENTE KLAUSURAUSWERTUNG IM MINT-BEREICH Freudenreich, Ronny; Herrmann, Sebastian..............................................................263
INVERTED CLASSROOM - WIE GEHT ES MIT ÜBERSCHAUBAREM AUFWAND Geike, Thomas......................................................................................................................................................269
ANALYSE KRITISCHER SITUATIONEN IM ONLINE-TEACHING IM LOCKDOWN - ABLEITUNG VON ANFORDERUNGEN UND HANDLUNGSEMPFEHLUNGEN FÜR LEHRENDE UND LERNENDE Görl-Rottstädt, Dörte; Köhler, Marcel; Heinrich-Zehm, Michael; Arnold, Maik; Hähnlein, Vera 275
VON PRÄSENZ INS DIGITALE UND ONLINE - UND DABEI DEN MENSCHEN AM BILDSCHIRM NICHT AUS DEM BLICK VERLIEREN Grökel, Anne; Schauer, Daniela..............................................................283
EINSATZ VON REMOTE UND VIRTUAL REALITY LABS ZUR AKTIVIERUNG GROSSER KOHORTEN Haack, Matthias; Jambor, Thomas N..............................................................................................................289
PRÄSENTATION EINER COMPUTERBASIERTEN KFZ-SIMULATION ALS LEHR-LERN-UMGEBUNG Hesse, Peter; Meier, Julius; Abele, Stephan; Glogger-Frey, Inga................................................................295
AUGMENTED, ASSISTED UND VIRTUAL REALITY IN DER (BERUFLICHEN) BILDUNG Köhler, Sebastian; Horey, Florian; Jambor, Thomas.....................................................................................305
DER MENSCH IM MITTELPUNKT VON ARBEITEN UND LERNEN MIT INDUSTRIAL AUGMENTED REALITY Mühlan, Kevin; Przybysz, Kazimierz Adam; Lindner, Fabian; Winkler, Daniel; Keil, Sophia..............................................................313
ZURÜCK IN DEN HÖRSAAL, ZURÜCK ZU MEHR INTERAKTION - DANK HYBRIDEN LEHRFORMATEN AUF GRUNDLAGE DES CONSTRUCTIVE ALIGNMENT Nagengast, Valentin; Liebschner, Marcus; Erhardt, Christina..............................................................321
KONZEPT ZUR DATENGESTÜTZTEN UNTERSTÜTZUNG BEIM AUFBAU VON KORREKTEN KOGNITIVEN MODELLEN Paehr, Johannes; Jambor, Thomas N...............................................................327
KONZEPT ZUR UNTERSUCHUNG DER AKZEPTANZ VON MIXED-REALITY-BASIERTEN LERNKONZEPTEN Schmitt, Bianca; Petersen, Maren................................................................................................................... 333
POSTER..................................................................................................................................................................339
KOOPERIEREN LERNEN IM BAUINGENIEURSSTUDIUM Block, Marlena; Embers, Stephan; König, Markus........................................................................................341
COOLMINT - TECHNIKBEGEISTERUNG DURCH VIELFÄLTIGE ANGEBOTE FÜR SCHÜLERINNEN UND SCHÜLER Thevapalan, Edmond; Temmen, Katrin..............................................................345
INFORMATIONSTECHNOLOGIEN DER ZUKUNFT - VIDEO- UND AUGMENTED-REALITY-BASIERTE MONTAGEANLEITUNGEN FÜR DIE TECHNISCHE BILDUNG Winkler, Daniel; Lindner, Fabian; Mühlan, Kevin; Przybysz, Kazimierz Adam; Keil, Sophia............... 349
PRE-CONFERENCE ..............................................................................................................................................355
LERNKATALYSATOREN UND KONVERGIERENDE OBJEKTE Ahrens, Volker; Hieronymus, Martin.............................................................................................................357
TANDEM TEACHING IN DER INGENIEURPÄDAGOGISCHEN AUSBILDUNG Al-Diban, Sabine.................................................................................................................................................365
FACILITY MANAGEMENT LEHREN MIT AUGMENTIERTER UND VIRTUELLER REALITÄT - EINE BEDINGUNGSANALYSE Klein-Wiele, Judit; Privenau, Jacqueline; Raab, Lisa..............................................................373
DIGITALE KOMPETENZEN FÜR ZUKÜNFTIGE WIRTSCHAFTSINGENIEURINNEN UND -INGENIEURE - EIN STAKEHOLDERRANKING MITHILFE DES ANALYTISCHEN HIERARCHIEPROZESSES (AHP) Lindner, Fabian; Winkler, Daniel; Mühlan, Kevin; Keil, Sophia..............................................................379
PROSUMAGE IN DER INGENIEURPÄDAGOGIK Meyer-Ross, K. Kathy.......................................................................................................................................... 385
EIN SERIOUS GAME MIT LERNENDEN- UND OBJEKTZENTRIERTER AUFGABENGESTALTUNG ALS DIGITALE LÖSUNG BEIM ÜBERGANG ZUR UNIVERSITÄREN INGENIEURAUSBILDUNG Müller, Thomas; Seidel, Anna; Weidle, Franziska; Dubrau, Marlen; Börner, Claudia..............................................................393
EINE HOCHSCHULÜBERGREIFENDE ENTWICKLUNG VON ONLINE-STUDIENORIENTIERUNGSFORMATEN AUF BASIS VON ZIELGRUPPENSPEZIFISCHEN ANALYSEN AM BEISPIEL VON INGENIEURWISSENSCHAFTLICHEN STUDIENGÄNGEN Schulz, Marcel......................................................................................399

View File

@@ -0,0 +1,14 @@
Title: Inhaltsverzeichnis zu ""
Subject: Inhaltsverzeichnis
Creator: ABBYY Finereader Server; modified using ExifTool 11.33
Producer: ABBYY FineReader Server; modified using iTextSharpTM 5.5.13 ©2000-2018 iText Group NV (AGPL-version)
CreationDate: 08/15/22 13:52:47
ModDate: 08/15/22 16:35:16
Tagged: yes
Form: none
Pages: 5
Encrypted: no
Page size: 595.2 x 850.3 pts (rotated 0 degrees)
File size: 216117 bytes
Optimized: no
PDF version: 1.4

View File

@@ -0,0 +1,219 @@
© 2018 Uniwersytet Warszawski/ University of Warsaw. Wydanie w otwartym dostępie na licencji CC BY-NC-ND (https://creativecommons.org/licenses/by-nc-nd/4.0/deed.pl). This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).
Applied Linguistics Papers 25/4, 2018, 101–116
The First Hundred Years: a History of Eye Tracking as a Research Method
Monika PŁUŻYCZKA Uniwersytet Warszawski/ University of Warsaw E-mail: mpluzyczka@uw.edu.pl,
Abstract: The paper describes the first hundred years of the history of eye tracking as a research method, dividing it into three phases of development. It starts by presenting the studies on tracing eye movements in reading at the end of the 19th century and the creation of the first eye trackers. The following part presents how eye tracking technology was improved in the era of film recordings, ceasing to be invasive for the eyes. It also shows how, in this period, the main focus of research shifted to practical aspects owing to the rise of the behaviourist movement in experimental psychology. The third phase starts in the 1970s, when researchers turned more towards the relationship between perception and mental processes, a shift linked to the establishment of a theoretical and methodological basis for cognitive psychology.
Keywords: eye tracking, research, history, technology, eye movements, reading process, development, eye trackers
1.
Technological progress and the increasing dissemination of research equipment (such as eye trackers and EEG) in disciplines not related to medicine, as well as ever easier access to state-of-the-art devices (e.g. fMRI), have opened up new cognitive possibilities for the humanities. Linguistics, language didactics and translation studies, too, increasingly use these technological possibilities to extend their boundaries of cognition. One such technological method in translation studies (in use in this academic discipline for about 12 years) is eye tracking.
Eye tracking devices allow researchers to trace eye movements, while dedicated software processes the data in such a way that it becomes possible to interpret it scientifically. At the foundation of this kind of research lies the assumption that there is a correlation between eye movements and particular mental processes. The eye, i.e. sight, is one of the most important human senses. Nearly 80% of all sensory impressions are delivered to the brain via the visual channel. Vision also provides information at the highest speed. Measured in bits per second, the various senses transfer information at the following rates: the eyes (sight) at about 10 million bit/s, the skin (touch) at one million bit/s, the ears (hearing) at 100,000 bit/s, the nose (smell) at 100,000 bit/s and the tongue (taste) at 1,000 bit/s (quoted from: V. Gollücke 2009). Moreover, 10% of the cerebral cortex is involved in the interpretation of visual information. The visual cortex, in turn, accounts for about 60% of the whole cerebral cortex, taking into account all areas responsible for responses to visual stimuli. This means that most of the information man gathers from the surrounding world (and the information gathered fastest) is received by means of sight. Furthermore, of the whole range of human behaviour, visual activity is the easiest to examine after verbal and motoric operations. This is yet another reason for the rising popularity of eye tracking studies.
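As a quick arithmetical check on the bandwidth figures quoted above (a minimal sketch, not part of the original article; the sense names and numbers are simply those cited from V. Gollücke 2009), the snippet below computes each channel's share of the combined stated throughput. Sight comes out at roughly 89% of the total, which is at least consistent in spirit with the "nearly 80% of sensory impressions" claim, even though the two figures measure different things.

```python
# Sensory data rates in bit/s, as quoted from V. Gollücke (2009) in the text above.
rates = {
    "sight": 10_000_000,
    "touch": 1_000_000,
    "hearing": 100_000,
    "smell": 100_000,
    "taste": 1_000,
}

total = sum(rates.values())
for sense, rate in rates.items():
    print(f"{sense:>8}: {rate:>10,} bit/s  ({rate / total:6.2%} of total)")
# sight accounts for roughly 89% of the combined stated bandwidth
```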
It should come as no surprise, then, that translation studies also increasingly attempts to use the achievements of eye tracking. Doing so enables us to collect interesting data on how we perceive visually during translation, and consequently also on how we process information during the translation act and what mental processes are involved in it.
In this article I will address the history of eye tracking studies, focusing in particular on the area of greatest interest to me, namely on studies regarding text perception.
2.
Eye tracking seems to be a modern technology; however, its roots can be traced back to the 19th century, when regular studies on the reading process began (G. Bente 2005). At the beginning researchers studied eye movements without measuring equipment, relying on simple observation. A mirror was placed on the pages of the book read by the test subject, behind whose back stood the experimenter, who observed the movement of the test subject's eyes in that mirror. Of course, that method was far from precise; nonetheless it yielded some interesting conclusions regarding visual perception during reading.
As to who was the first to conduct studies on tracking eye movement during reading, there is some uncertainty. Most sources report that the first researcher to have described the movement of the eyes during reading was the French ophthalmologist Louis Émile Javal, at the end of the 1870s, and these years are also considered the beginning of the era of eye tracking studies. The experiments led L.É. Javal to the conclusion in 1879 that reading did not require the eyes to move continuously along a line of text, as had previously been assumed. He proceeded to explain that reading was not a linear process: the test subject's eyes did not move continuously during the passage over each line, but rather performed short rapid movements (saccades) intermingled with pauses, or short stops of the eyes (fixations), on concrete elements.
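The saccade–fixation distinction described here is still how raw gaze recordings are segmented today. As a hedged illustration (not from the article, and not any historical apparatus), the sketch below implements a simple dispersion-threshold fixation filter: consecutive gaze samples that stay within a small spatial window for a minimum duration are grouped into a fixation, and whatever lies between fixations is treated as saccadic movement. The sample format and thresholds are illustrative assumptions.

```python
def detect_fixations(samples, max_dispersion=30.0, min_duration=0.1):
    """Group (t, x, y) gaze samples into fixations, dispersion-threshold style.

    samples: list of (timestamp_s, x_px, y_px) tuples, time-ordered.
    max_dispersion: max (x-range + y-range) in pixels for a fixation window.
    min_duration: minimum fixation length in seconds.
    Returns a list of (start_t, end_t, centroid_x, centroid_y).
    """
    fixations, start = [], 0
    while start < len(samples):
        end = start
        window = [samples[start]]
        # grow the window while it stays spatially compact
        while end + 1 < len(samples):
            candidate = window + [samples[end + 1]]
            xs = [p[1] for p in candidate]
            ys = [p[2] for p in candidate]
            if (max(xs) - min(xs)) + (max(ys) - min(ys)) > max_dispersion:
                break
            window = candidate
            end += 1
        duration = window[-1][0] - window[0][0]
        if duration >= min_duration:
            cx = sum(p[1] for p in window) / len(window)
            cy = sum(p[2] for p in window) / len(window)
            fixations.append((window[0][0], window[-1][0], cx, cy))
            start = end + 1          # continue after the fixation
        else:
            start += 1               # too short: treat as a saccade sample and move on
    return fixations
```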
In the article “Did Javal Measure Eye Movements during Reading?” (2009) N.J. Wade and B.W. Tatler challenged the proposition that it had been L.É. Javal who was the first to carry out and describe such tests. They determined – and it seems they were right in doing so – that L.É. Javal had been credited with the achievements of M. Lamare1. This mistake was due to a misinterpretation of the words of E.B. Huey, who had written:
Lamare, working with Javal, finding that the movement of the eye in reading was not continuous, but by little jerks (par saccades), devised the following method for counting these: A blunt point placed on the upper eyelid of the reader put in action a microphone, whose sound, transmitted by a rubber tube, made known each movement to the ear of the experimenter – the short reading jerks causing a brief sound, while the extensive movements made in passing from the end of the line to the commencement of the next, caused a more prolonged sound (E.B. Huey 1900: 285).
The authors of the article stress that he really meant the research of M. Lamare, who had been conducting studies in L.É. Javal's laboratory. These experiments were actually described by L.É. Javal himself:
Following the research of M. Lamare in our laboratory, the eye makes several saccades during the passage over each line, about one for every 15–18 letters of text. It is probable that in myopes the eye reacts with a rapid change in accommodation with every one of these saccades (L.É. Javal 1879: 252).
The authors' supposition is confirmed by L.É. Javal's own words, as he always stressed in his works that it was M. Lamare who carried out the studies indicating that eye movement in the reading process was not linear (see also: L.É. Javal 1905: 127).
L.É. Javal, in turn, wrote on the basis of his observations that he did not notice vertical eye movements during reading, and this argument actually stands in contrast to words attributed to him:
(…) gaze glides along a line slightly higher than the centre of the characters. The reason for this is easy to see: if gaze simply glides horizontally, complicated and useless movements are avoided, and the chosen position of the horizontal is determined by the structure of the typographic characters2 (L.É. Javal 1878: 251).
Doubtlessly, L.É. Javal's contribution was the first-time use of the term “saccades”. L.É. Javal counted those saccades by attaching a microphone to the upper eyelid of the reader. When the reader's gaze slid along the text, each eye movement could be registered via the microphone (J. Grobelny et al. 2006).
Little is known about M. Lamare besides the above-mentioned citations. He failed to publish anything for over a decade after carrying out his experiments. He later described several methods he had been using in order to count and analyse saccades during reading. However, as he was not satisfied with his observations of eye movements, he created a device that would be able to record the eye jerks and stops. He placed a blunt point on the eyelid; that point “captured” the sound created by each saccade, transferring it as a soft snap to the experimenters ear, via a drum with an ebonite membrane in the centre and to which a small tube was attached:
1 As stressed by the authors, in those times many undertook to study eye movements, including such scholars as E. Rählmann (1878), A. Ahrens (1891), E. Landolt (1891) and M.H.E. Tscherning (1898). A. Ahrens attempted to examine the eye movements using a lens made of ivory and hair attached to it. While he was not able to go through with his idea, it was later used by E.B. Delabarre (1898) and E.B. Huey (1898, 1900).
The method that gives the best results is one by which the movements are heard via a drum with an ebonite membrane in the centre and to which a small tube is attached; the tube is in contact with the conjunctiva or eyelid and is connected to both ears by rubber tubes… The apparatus yields distinctive sounds which an assistant can count and add, and note for each line. The return movement of the eyes to the beginning of a line gives a longer and louder noise that is easy to recognise; one counts the number of saccades from the start of the line to be able to note the number of divisions that occur in a line (M. Lamare 1892: 357; italic print by the author).
M. Lamare was the first to apply a mechanical device, arguing that saccades were easier to register by sound than by sight, which now seems quite a surprising assertion.
A regular examination of eye movements was commenced by E. Huey (1898). He, too, is considered the creator of the first eye tracker. E. Huey used a sort of contact lens with an opening for the iris. That lens was connected with an aluminium indicator which showed the eye movement (regression factor). This method, however, was so invasive to the eye that E. Huey resorted to giving his test subjects cocaine in order to reduce their discomfort during the experiment.
Around the same time (1898) a similarly invasive mechanical method of studying eye movements was worked out by E.B. Delabarre. He used a cap made of gypsum which adhered to the moist surface of the eye. Attached to this cap was a wire that led to a lever which drew the horizontal movements of the eye on the sooted surface of a cinematographic cylinder. The test subject (usually E.B. Delabarre himself) was able to read a text through a hole made in the gypsum cap. The eye was anaesthetised with cocaine. The gypsum cap did not detach from the eye until it started filling with tears. It should be noted that E.B. Delabarre himself was unable to determine whether the method was safe for the eye. He merely asserted that after recording eye movements for over an hour, it took him a week to recover.
The first non-invasive and precise eye tracker, in turn, was created in 1901 by the Americans R. Dodge and T.S. Cline, who started the phase of optical eye trackers. They were the first to use light reflected from the surface of the cornea, which fell through an optical system onto a moving photosensitive photographic plate, thus leaving a record of the eye movement on that plate. Their device, called “The Dodge Photochronograph” (it was used mostly by R. Dodge and his collaborators), was a breakthrough in the development of eye tracking technology and it popularised devices using the reflection of light from the cornea. However, R. Dodge's and T.S. Cline's device had two drawbacks: it registered only horizontal movements of the eyes, and it required test subjects to keep their heads still. Nonetheless, thanks to these studies it was discovered that a person does not receive information from the surrounding world while saccadic movements occur.
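Corneal reflection remains the dominant principle in today's video-based eye trackers: the offset between the pupil centre and the corneal reflection (glint) is mapped to a point of regard on the stimulus. The sketch below is a minimal, hypothetical illustration of that mapping step only (it is not described in the article): a least-squares polynomial calibration, fitted from a handful of known calibration targets, that turns pupil–glint difference vectors into screen coordinates.

```python
import numpy as np

def fit_calibration(pupil_glint_vectors, screen_points):
    """Fit a 2nd-order polynomial map from pupil-glint vectors to screen coords.

    pupil_glint_vectors: (N, 2) array of (dx, dy) offsets from calibration.
    screen_points:       (N, 2) array of known target positions on the screen.
    Needs at least six calibration targets; returns a (6, 2) coefficient matrix.
    """
    d = np.asarray(pupil_glint_vectors, dtype=float)
    design = np.column_stack([
        np.ones(len(d)), d[:, 0], d[:, 1],
        d[:, 0] * d[:, 1], d[:, 0] ** 2, d[:, 1] ** 2,
    ])
    coeffs, *_ = np.linalg.lstsq(design, np.asarray(screen_points, float), rcond=None)
    return coeffs

def map_gaze(coeffs, dx, dy):
    """Map one pupil-glint offset to an estimated on-screen gaze point (x, y)."""
    features = np.array([1.0, dx, dy, dx * dy, dx ** 2, dy ** 2])
    return features @ coeffs
```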
Photograph 1. R. Dodge next to the eye tracker designed by him (A.R. Diefendor/ R. Dodge 1908: 462).
The photographic plates used by R. Dodge and T.S. Cline were next replaced by photographic tape.
The next breakthrough in eye tracking research came about with the construction of a device that was able to record both horizontal and vertical eye movements. It was presented a few years later, in 1905, by C.H. Judd, C.N. McAllister and W.M. Steel. They placed a small mechanical indicator on the test subject's eye which reflected a light spot. Depending on the position of the eye, the light spot traced the eye movements on a photosensitive tape. The advantage of that method was that there was no mechanical contact with the eye, but it had one drawback, just like the apparatus of R. Dodge and T.S. Cline: it required the test subject's head to remain still for the device to register changes.
The initial era of eye tracking research yielded results which laid the foundations for subsequent decades of studies and which created a data base on eye movement, perception, seeing and looking. At the heart of research at that time were such issues as saccadic suppression (meaning that we do not receive information during a saccade), the saccade latency (i.e. the time we need to initiate eye movement) and the vision span (i.e. the effective visual field) (see K. Rayner 1998).
The second era of eye movement research started with the advent of film recordings in the 1920s, which spurred the further development of eye tracking techniques. The new methods had several advantages: they did not require invasive techniques affecting the test subject, they recorded both directions of eye movement, and they provided an objective record of eye movements. However, in terms of experimental precision, this technique hardly differed from the original observation method.
Meanwhile, electric eye tracking technology was developing at the same time as the above methods. That technology was based on the difference of potentials between the back and the front part of the eye, resulting from the electric activity of the retina. In this method electrodes are placed next to the eye, usually on both sides, to register changes of the potential as a result of eye movement (J. Grobelny et al. 2006). In 1922 E. Schott used electrooculography, which involves measuring the varying electric potentials of the cornea and the retina. The use of this research method considerably improved the accuracy and credibility of results.
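Electrooculography still works on the principle described here: the standing corneo-retinal potential makes the voltage between two skin electrodes vary roughly linearly with horizontal gaze angle over a limited range. The sketch below is a simplified, hypothetical illustration (not from the article): it calibrates that linear relation from two fixations at known angles and then converts raw EOG voltages into gaze angles.

```python
def calibrate_eog(v_left, v_right, angle_left=-20.0, angle_right=20.0):
    """Return (gain_deg_per_volt, offset) from two calibration fixations.

    v_left / v_right: mean EOG voltages while fixating targets at the known
    angles (in degrees). The linear model angle = gain * voltage + offset is
    a common first-order approximation for horizontal EOG.
    """
    gain = (angle_right - angle_left) / (v_right - v_left)
    offset = angle_left - gain * v_left
    return gain, offset

def eog_to_angle(voltages, gain, offset):
    """Convert a sequence of EOG voltage samples to horizontal gaze angles."""
    return [gain * v + offset for v in voltages]

# Example: electrodes give -0.4 mV at -20 degrees and +0.4 mV at +20 degrees.
gain, offset = calibrate_eog(-0.0004, 0.0004)
print(eog_to_angle([-0.0002, 0.0, 0.0003], gain, offset))  # ~[-10.0, 0.0, 15.0]
```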
The second stage of eye tracking studies was the result of technological progress and the development of the behaviourist movement in experimental psychology. Consequently, at that time research on eye movement usually focused on the practical aspect, that is the specifics of eye movement itself. Only a small part of that research was devoted to the cognitive processes these movements could have been indicative of:
Psychologists who studied eye movements and fixations prior to the 1970s generally attempted to avoid cognitive factors such as learning, memory, workload, and deployment of attention. Instead their focus was on relationships between eye movements and simple visual stimulus properties such as target movement, contrast, and location. Their solution to the problem of higher-level cognitive factors had been “to ignore, minimize or postpone their consideration in an attempt to develop models of the supposedly simpler lower-level processes, namely, sensorimotor relationships and their underlying physiology” (E. Kowler 1990: 1, quote from: R.J.K. Jacob/ K.S. Karn 2003: 575).
Important publications of the time on reading included M.A. Tinker (1946, 1958), R.Y. Walker (1933), L.G. Stone (1941) and also the studies of G.T. Buswell (1935, 1937). In 1930 Miles Tinker and his collaborators began using photographic techniques to study the eye movements of readers. He also examined the impact of font type and size, as well as the look and layout of the page, on reading outcomes and reading speed, and their influence on how the eyes move. In turn, G.T. Buswell (1935, 1937) created the first non-contact device registering eye movements. He used it to study both reading and the viewing of images.
Photograph 2 and photograph 3. Apparatus used for photographing eye movements (G.T. Buswell 1935: 12,13).
Buswell's technique involved a method of separating light reflected from the cornea, which enabled a two-dimensional registration of movements of a single eye:
Photograph 4. Sample of film record by eye movement apparatus. The upper record shows the horizontal movements of the eye and the lower one the vertical movements; the line marked E indicates eye movements and H the record of head movements (G.T. Buswell 1935: 14).
The method consisted in photographing on a moving film the eye movements of a group of test subjects while they looked at series of pictures:
The apparatus was built for the particular purpose of this experiment in the workshop of the laboratories in educational psychology of the University of Chicago. Basically the apparatus is a large camera built in such a way that the two films can be moved continuously during the process of photographing. The various lenses and mirrors are simply for the purpose of bringing to a focus on the film the reflection of the tiny spot of light from the cornea of the eye. The light which reflects on the eye originates under the table. It is passed forward through a series of lenses and then upward through two holes in the table, after which it strikes two circular mirrors and is reflected to the subject's eyes. Instead of facing the camera lens, as has been necessary with previous pieces of apparatus of this sort, the subject is placed at right angles to the camera which gives him an open field of vision of whatever size is needed. Small pictures can be placed as close as the normal reading distance of twelve inches, whereas larger pictures can be set back whatever number of feet seems desirable. The provision of this larger field of vision adds a great deal of flexibility to the uses of the apparatus (G.T. Buswell 1935: 11).
Next, in 1947, P.M. Fitts and his collaborators started using a film camera to carry out eye tracking experiments (see P.M. Fitts/ R.E. Jones/ J.L. Milton 1950) in studies on the movement of eyes of pilots using controls in a cockpit and the instrument landing system.
Test subjects were released from immobilisation during tests only in 1948, by H. Hartridge and L.C. Thompson. They created a device that was put on the head of the test participant:
A new type of apparatus has recently been devised by Hartridge and Thomson, namely to hang from a suitable counterpoise a frame which carries the microscope, the reference lights and the corneal light source. It also carries a mouth plate which fits on to the teeth of the subject. The counterpoise is so arranged that the apparatus tends neither to rise nor to fall, but can be freely rotated in all directions. The optical apparatus is so disposed that the subject can observe uninterruptedly suitable fixation points of light which are attached to a wall beyond. The apparatus retains the advantage of the previous one, of enabling measurements of eye movements to be obtained which are quite independent of head movements (H. Hartridge and L.C. Thompson 1948: 588–589).
It was hardly a comfortable method for the test subject; however, it no longer required him to sit still in one place.
Schema 1. An eye tracker devised by H. Hartridge and L.C. Thompson (H. Hartridge/ L.C. Thompson 1948: 588).
A breakthrough in terms of comfort for subjects was achieved with the development of mobile eye tracker technology. The technology was perfected in the 1960s by B. Shackel (1960) and by N.H. Mackworth and E.L. Thomas (1962), making the devices even less burdensome for the test subjects.
Suggested in 1958 by J.F. Mackworth and N.H. Mackworth, the method for registering eye positions in recordings of a scene watched by a test subject during the study represented another important discovery. Thanks to this innovation it became possible to observe eye movement concurrently with the result of that movement, in the form of the gaze path over a given element. This eased the interpretation of eye tracking data considerably and widened research possibilities.
Also, the 1960s brought about a return to the invasive methods of E. Huey and E.B. Delabarre. Scientists of the time came to the conclusion that a cap made of gypsum placed on the eye may be attached to the eye by way of suction. Research of Soviet biophysicist A. L. Yarbus4 garnered a lot of attention back then. He studied the movement of eyes using a specific device: a rubber suction cap with a mirror (later replaced with a radio antenna), attached by suction directly to the sclera of the human eye.
4 A.L. Yarbus = Альфред Лукьянович Ярбус this transcribed form of the Russian scientists name is known in English-speaking academic circles. It was adopted according to English transliteration rules, since Yarbus works were translated into English.
Picture 1. The suction device or “cap” (A.L. Yarbus 1967).
Light reflected from the mirror recorded eye movement on photographic paper. The head of the test subject was immobilised in a metal frame and during the experiment he had to keep biting a special plastic form which was attached to the metal frame and which constituted a cast of the test subjects teeth.
Photograph 5. Yarbus eye tracker (A.L. Yarbus 1967).
This atypical technique was not taken up by other researchers on account of its extreme invasiveness. An experiment could last only a few minutes because of how inconvenient it was for the subject, whereas processing the data took weeks or months (see A.L. Yarbus 1965, 1967).
The highly contentious method nevertheless allowed A.L. Yarbus to generate interesting results. The experiments clearly showed that the gaze path depends on the task the subject is set, i.e. the goal given to the test subject determines the corresponding eye movements, what is perceived and which elements receive attention.
A.L. Yarbus also revealed that when watching an image people look not only at the contours but above all at the so-called logic centre (e.g. a person or an animal). To viewers, people's faces mean more than the surrounding figures or settings. When looking at a portrait, in turn, viewers focus mostly on the eyes,
mouth and nose:
Граница и контур важны для появления зрительного образа, однако когда образ возник и видится непрерывно, у наблюдателя нет необходимости специально интересоваться границами и контурами. Граница и контур всего лишь элементы, из которых наряду с другими, не менее важными элементами складывается наше восприятие и узнавание предмета. Совершенно очевидно, что контур предмета будет привлекать внимание наблюдателя, если в самой форме контура заключены важные и нужные сведения (A.L. Yarbus 2002: 412)5.
5 The borders and contours are important for the emergence of a complete picture or image, and yet once the image appears and it stays in sight, the observer no longer has to take special interest in the borders and contours. These borders and contours are just some of the elements which, like other, no less important, elements, affect our perception and recognition of the object. It is clear that the outline of an object will attract the observer's attention if the form of the contour itself conveys important and necessary information (A.L. Yarbus 2002: 412, own translation).
This last remark was illustrated by A.L. Yarbus (2002) by means of the example of a sculpture representing Nefertiti, where the viewer's whole attention during the experiment focused on the profile, the contour of the sculpture, as well as on its eyes:
Picture 1. Eye movement depiction for two-minute observation of Nefertiti sculpture (A.L. Yarbus 2002: 412).
Other researchers used special contact lenses fitted with coils; these lenses used the principle of electromagnetic induction: wires attached to the lens registered changes in the signal induced in the coils under the influence of eye movements within the electromagnetic field generated around the head (quoted from: J. Grobelny et al. 2006). However, this method had one major disadvantage: the data obtained in such a study could be accessed only after lengthy processing, and the results were not visible on an ongoing basis.
The third phase of eye tracking research dates back to the mid-1970s. It coincided with two phenomena, relating to:
• psychology, i.e. the establishment of a theoretical and methodological basis for cognitive psychology;
• technology, i.e. the start of the use of computer and television technology and of electronic techniques for detecting the eye and locating its position.
Both shaped the character of the third phase of eye tracking research, during which researchers (1) turned more towards the relationship between perception and the mental processes transpiring at the same time in the brain and (2) took advantage of the boom in eye tracking technology. This stage coincided with the development of linguistic theories on language, language acquisition, text and language properties, language processing, translation, etc. This, in turn, extended the range of eye tracking research into linguistics, covering, among other things, text perception.
The 1960s saw the creation of the first eye tracking device resembling modern day equipment. It was made for the U.S. Air Force (J. Merchant 1966, 1969; J. Merchant et al. 1974). This device was called the “oculometer”.
Photograph 6 and 7. Honeywell oculometer (J. Merchant 1966).
Photograph 8. Improved oculometer devised by J. Merchant.
On the left side optomechanical unit, on the right electronics unit (J. Merchant, 1969: 1213).
Thanks to computer algorithms the iris was recognised on a video screen and its geometrical centre was then determined, as well as the direction in which the tested person was looking. This technology resembles modern-day computer-based eye trackers. Meanwhile, in medicine researchers still sometimes use devices based on electromagnetic induction or electrooculographic technology (quoted from: J. Grobelny et al. 2006).
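The iris-centre step mentioned above is easy to sketch with present-day tools. The following is a minimal, hypothetical illustration (it is neither from the article nor the Honeywell algorithm): the pupil is isolated by intensity thresholding in a grayscale eye image and its geometric centre is estimated as the centroid of the dark pixels.

```python
import numpy as np

def pupil_center(gray_eye_image, dark_threshold=50):
    """Estimate the pupil centre in a grayscale eye image (2-D uint8 array).

    The pupil is assumed to be the darkest compact region; all pixels below
    dark_threshold are treated as pupil candidates and their centroid is
    returned as (row, col), or None if nothing is dark enough.
    """
    img = np.asarray(gray_eye_image)
    mask = img < dark_threshold
    if not mask.any():
        return None
    rows, cols = np.nonzero(mask)
    return rows.mean(), cols.mean()

# Example with a synthetic 100x100 "eye": a dark disc centred at (40, 60).
yy, xx = np.mgrid[0:100, 0:100]
frame = np.full((100, 100), 200, dtype=np.uint8)
frame[(yy - 40) ** 2 + (xx - 60) ** 2 < 15 ** 2] = 10
print(pupil_center(frame))  # approximately (40.0, 60.0)
```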
The mobile eye tracker was actually devised simultaneously by two army research teams. The first, as mentioned above, had been commissioned by the U.S. Air Force and was produced by Honeywell Corporation, while the other was created for the U.S. Army by EG&G Corporation. The two eye tracker models reduced the strain on the test subjects; moreover, they became a landmark of eye tracking technology, as they allowed for automatic analysis of eye tracking data and faster processing while the experiment was still running (see R.H. Lambert/ R.A. Monty/ R.J. Hall 1974; R.A. Monty 1975; J. Anliker 1976).
Important publications released at that time built on the development of cognitive psychology and took its results as their starting point. Their authors attempted to develop theoretical models covering the correlation of fixations with particular cognitive processes (i.a.: M.A. Just/ P.A. Carpenter 1976a, 1976b, 1980; works by K. Rayner, i.a.: K. Rayner 1977, 1981, 1983, 1989, also group papers K. Rayner et al. 1989, 1976, 1986 etc.; R.A. Monty/ J.W. Senders 1976; J.W. Senders/ D.F. Fisher/ R.A. Monty 1978; D.F. Fisher/ R.A. Monty/ J.W. Senders 1981). There have also been significant studies on the analysis of the results of eye tracking research (i.a.: R. Kliegl/ R.K. Olson 1981, L.F. Scinto/ B.D. Barnette 1986), as well as studies concerned with visual imaging in relation to the position of the eye (G.W. McConkie/ K. Rayner 1975, K. Rayner 1975b, S.M. Reder 1973).
An extremely interesting study on the matter, and probably also the most comprehensive one so far, summarising eye tracking research on reading processes from the mid-1970s to the 1990s, was written in 1998 by K. Rayner in his work “Eye Movements in Reading and Information Processing: 20 Years of Research”. Rayner's summary refers for the most part to research done by himself and his team, although it also contains references to crucial achievements in eye tracking research on text perception carried out by other scientists.
The next stage of eye tracking research started in the 1990s. It was triggered by a fast development of eye tracking technology and an equally sudden surge in computer operation and electronic data processing capacities. The devices became more tolerant of head movements of test subjects, so the head no longer had to be immobilised. Furthermore, the latest technological advancements expanded the range of application of eye tracking techniques. Eye tracking analyses were being used in an ever growing number of scientific fields and business sectors. Many companies dealing professionally in eye tracking studies appeared, offering services in the most diverse areas of life.
What is more, as scientific disciplines advanced and financing for research at universities rose (which made it possible to buy research equipment), eye tracking studies increasingly became institutional: whole groups of scientists, research laboratories and work groups at universities, all specialising in eye tracking, were established. This, in turn, resulted in a sudden spike in the number of publications, which also contributed to the further dissemination of the issue at hand and sparked more interest in the matter. For instance, the Danish firm iMotions, which creates eye tracking and biometric software, has carried out studies using the Internet search instrument Google Scholar, noting how the number of articles with the key term “eye tracking” has increased since the 1970s6:
1970–74: 310; 1975–79: 487; 1980–84: 633; 1985–89: 829; 1990–94: 1,320; 1995–99: 2,540; 2000–04: 7,060; 2005–09: 15,000; 2010–14: 21,600 (projected result)
Chart 1. Data on number of articles on eye tracking published in the past decades7.
We can bring these results up to date and check the number of articles published online with the key term “eye tracking”. Over the last few years the number of such articles has increased dramatically: in 2018 a Google Scholar search returned 268,000 results.8
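As a small illustration of how steep this growth is (a sketch added here, not part of the article), the snippet below takes the per-period counts quoted from the iMotions chart and prints the growth factor from each five-year period to the next.

```python
# Article counts per five-year period, as quoted from the iMotions chart above.
periods = ["1970-74", "1975-79", "1980-84", "1985-89", "1990-94",
           "1995-99", "2000-04", "2005-09", "2010-14"]
counts = [310, 487, 633, 829, 1320, 2540, 7060, 15000, 21600]

for (p_prev, c_prev), (p_next, c_next) in zip(zip(periods, counts),
                                              zip(periods[1:], counts[1:])):
    print(f"{p_prev} -> {p_next}: x{c_next / c_prev:.2f}")
# The jump from 1995-99 to 2000-04 alone is roughly a factor of 2.8.
```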
Looking at the data one will notice immediately how abruptly interest in eye tracking research soared and, consequently, how the application of the eye tracking methodology widened. In the 1950s–70s eye tracking was most popular in aviation and aerospace medicine, where researchers examined how people respond in the extreme conditions produced in flight simulations. Scholars of neurology and psychology also took a keen, unwavering interest in eye tracking, examining visual perception when reading, when experiencing aesthetic sensations (e.g. looking at works of art) and in studies on processes of concentration. With time, new disciplines were added to the aforementioned research fields in which eye tracking studies were continued; these disciplines emerged as a result of consumer needs, commercial market requirements and the development of technology and various socio-economic phenomena. Finally, during the last decade, researchers from the arts and humanities have begun to use eye tracking equipment in their studies, expanding research possibilities as well as cognitive boundaries. At present the range of application of eye tracking is significantly broader, which, however, goes beyond the subject of this article.
6 The firm iMotions has mostly analysed English publications; the search term was thus the English phrase eye tracking, spelled without a hyphen.
7 Source: http://imotionsglobal.com/blog/exponential-growth-in-academic-eye-tracking-papers-over-the-last-40-years/, accessed: 18.03.2014.
8 Accessed: 19.11.2018.
References
Ahrens, A. (1891), Die Bewegung der Augen beim Schreiben. Rostock.
Anliker, J. (1976), Eye Movements: On-line Measurement, Analysis, and Control. In: R.S. Monty/ J.W. Senders (eds.), Eye Movements and Psychological Processes. Hillsdale, 185–199.
Bente, G. (2005), Erfassung und Analyse des Blickverhaltens. In: R. Mangold/ P. Vorderer/ G. Bente (eds.), Lehrbuch der Medienpsychologie. Göttingen, 297–324.
Buswell, G.T. (1935), How People Look at Pictures. Chicago.
Buswell, G.T. (1937), How Adults Read. Chicago.
Delabarre, E.B. (1898), A Method of Recording Eye-movements. In: “Psychological Review” 8, 572–74.
Diefendor, A.R./ R. Dodge (1908), An Experimental Study of the Ocular Reactions of the Insane from Photographic Records. In: “Brain” 31, 451–489.
Dodge, R./ T.S. Cline (1901), The Angle Velocity of Eye Movements. In: “Psychological Review” 8, 145–157.
Fitts, P.M./ R.E. Jones/ J.L. Milton (1950), Eye Movements of Aircraft Pilots during Instrument-landing Approaches. In: “Aeronautical Engineering Review” 9 (2), 24–29.
Gollücke, V. (2009), Eye-Tracking Grundlagen, Technologien und Anwendungsgebiete, Studienarbeit. Grin Verlag.
Grobelny, J./ K. Jach/ M. Kuliński/ R. Michalski (2006), Śledzenie wzroku w badaniach jakości użytkowej oprogramowania. Historia i mierniki. (URL: https://repin.pjwstk.edu.pl/xmlui/bitstream/handle/186319/166/Kansei%202006_Grobelny.pdf?sequence=1).
Hartridge, H./ L.C. Thompson (1948), Methods of Investigating Eye Movements. In: “British Journal of Ophthalmology” 32, 581–591.
Huey, E.B. (1898), Preliminary Experiments in the Physiology and Psychology of Reading. In: “American Journal of Psychology” 9, 575–586.
Huey, E.B. (1900), On the Psychology and Physiology of Reading. Part I. In: “American Journal of Psychology” 11 (3), 283–302.
Huey, E.B. (1901), On the Psychology and Physiology of Reading. Part II. In: “American Journal of Psychology” 12 (3), 292–312.
Jacob, R.J.K./ K.S. Karn (2003), Eye Tracking in Human-Computer Interaction and Usability Research: Ready to Deliver the Promises. In: J. Hyönä/ R. Radach/ H. Deubel (eds.), The Mind's Eye: Cognitive and Applied Aspects of Eye Movement Research. Amsterdam, 573–605.
Javal, L.É. (1878), Essai sur la physiologie de la lecture. In: “Annales d'Oculistique” 80, 240–274.
Javal, L.É. (1879), Essai sur la physiologie de la lecture. In: “Annales d'Oculistique” 82, 242–253.
Javal, L.É. (1905), Physiologie de la lecture et de l'écriture. Paris: Alcan.
Judd, C.H./ C.N. McAllister/ W.M. Steel (1905), General Introduction to a Series of Studies of Eye Movements by Means of Kinetoscopic Photographs. In: J.M. Baldwin/ H.C. Warren/ C.H. Judd (eds.), Psychological Review, Monograph Supplements. Baltimore, 1–16.
Just, M.A./ P.A. Carpenter (1976a), Eye Fixations and Cognitive Processes. In: “Cognitive Psychology” 8, 441–480.
Just, M.A./ P.A. Carpenter (1976b), The Role of Eye-fixation Research in Cognitive Psychology. In: “Behavior Research Methods & Instrumentation” 8, 139–143.
Just, M.A./ P.A. Carpenter (1980), A Theory of Reading: From Eye Fixations to Comprehension. In: “Psychological Review” 87 (4), 329–354.
Kliegl, R./ R.K. Olson (1981), Reduction and Calibration of Eye Movement Data. In: “Behavior Research Methods and Instrumentation” 13, 107–111.
Landolt, E. (1891), Nouvelle recherches sur la physiologie des mouvements des yeux. In: “Archives d'Ophthalmologie” 11, 385–395.
Mackworth, J.F./ N.H. Mackworth (1958), Eye Fixations Recorded on Changing Visual Scenes by Television Eye-marker. In: “Journal of Optical Society of America” 52, 713–716.
Mackworth, N.H./ E.L. Thomas (1962), Head-mounted Eye-marker Camera. In: “Journal of the Optical Society of America” 52, 713–716.
McConkie, G.W./ K. Rayner (1975), The Span of the Effective Stimulus during a Fixation in Reading. In: “Perception & Psychophysics” 17, 578–586.
Merchant, J. (1966), Interim Technical Report. Oculometer. Contract No. NASW-1159, February 25, 1965 – December 25, 1965. Boston.
Merchant, J./ R. Morrissette/ J.L. Porterfield (1974), Remote Measurement of Eye Direction Allowing Subject Motion over One Cubic Foot of Space. In: “IEEE Transactions on Biomedical Engineering” 21 (4), 309–317.
Monty, R.A. (1975), An Advanced Eye-movement Measuring and Recording System. In: “American Psychologist” 30, 331–335.
Monty, R.A./ J.W. Senders (eds.) (1976), Eye Movements and Psychological Processes. Hillsdale.
Płużyczka, M. (2015), Tłumaczenie a vista. Rozważania teoretyczne i badania eyetrackingowe. Warszawa.
Rählmann, E. (1878), Über den Nystagmus und seine Ätiologie. In: “Archiv für Ophthalmologie” 24, 237–242.
Rayner, K. (1975), The Perceptual Span and Peripheral Cues in Reading. In: “Cognitive Psychology” 7, 65–81.
Rayner, K. (1977), Visual Attention in Reading: Eye Movements Reflect Cognitive Processes. In: “Memory & Cognition” 5, 443–448.
Rayner, K. (1983), Eye Movements in Reading: Perceptual and Language Processes. San Diego.
Rayner, K. (1998), Eye Movements in Reading and Information Processing: 20 Years of Research. In: “Psychological Bulletin” 124 (3), 372–422.
Rayner, K./ G.W. McConkie (1976), What Guides a Reader's Eye Movements? In: “Vision Research” 16, 829–837.
Rayner, K./ G.W. McConkie/ D. Zola (1980), Integrating Information across Eye Movements. In: “Cognitive Psychology” 12, 206–226.
Reder, S.M. (1973), On-line Monitoring of Eye Position Signals in Contingent and Noncontingent Paradigms. In: “Behavior Research Methods and Instrumentation” 5, 218–228.
Scinto, L.F./ B.D. Barnette (1986), An Algorithm for Determining Clusters, Pairs or Singletons in Eye-movement Scan-path Records. In: “Behavior Research Methods, Instruments & Computers” 18, 41–44.
Shackel, B. (1960), Note on Mobile Eye Viewpoint Recording. In: “Journal of the Optical Society of America” 59, 763–768.
Stone, L.G. (1941), Reading Reactions for Varied Types of Subject Matter: An Analytical Study of the Eye-movements of College Freshmen. In: “Journal of Experimental Education” 10 (1), 64–77.
Tinker, M.A. (1946), A Study of Eye Movements in Reading. In: “Psychological Bulletin” 43 (2), 93–120.
Tinker, M.A. (1958), Recent Studies of Eye Movements in Reading. In: “Psychological Bulletin” 55 (4), 215–231.
Tscherning, M.H.E. (1898), Optique physiologique. Paris: Carré and Naud.
Wade, N.J./ B.W. Tatler (2009), Did Javal Measure Eye Movements during Reading?. In: “Journal of Eye Movement Research” 2 (5), 1–7.
Walker, R.Y. (1933), The Eye-movements of Good Readers. In: “Psychological Monographs” 44 (3), 95–117.
Yarbus, A.L. (1967), Eye Movements and Vision. New York.

View File

@@ -0,0 +1,14 @@
Title: Microsoft Word - ALP 25_4 9 Monika PLUYCZKA
Author: sambor
Creator: PScript5.dll Version 5.2.2
Producer: Acrobat Distiller 10.1.16 (Windows)
CreationDate: 01/14/19 17:33:20
ModDate: 01/14/19 17:33:34
Tagged: no
Form: none
Pages: 16
Encrypted: no
Page size: 499 x 709 pts (rotated 0 degrees)
File size: 825119 bytes
Optimized: yes
PDF version: 1.5

File diff suppressed because it is too large

View File

@@ -0,0 +1,15 @@
Title:
Author:
Creator: LaTeX with hyperref package
Producer: xdvipdfmx (0.7.8)
CreationDate: 04/13/17 18:22:02
ModDate: 04/18/22 08:30:29
Tagged: no
Form: AcroForm
Pages: 291
Encrypted: no
Page size: 541.417 x 666.142 pts (rotated 0 degrees)
File size: 8704512 bytes
Optimized: yes
PDF version: 1.4

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,137 @@
Index of Learning Styles Questionnaire
Article · January 1999
Index of Learning Styles Questionnaire
NC STATE UNIVERSITY
http://www.engr.ncsu.edu/learningstyles/ilsweb.html
Index of Learning Styles Questionnaire
Barbara A. Soloman First-Year College North Carolina State University Raleigh, North Carolina 27695
Richard M. Felder Department of Chemical Engineering
North Carolina State University Raleigh, NC 27695-7905
Directions
Please provide us with your full name. Your name will be printed on the information that is returned to you.
Full Name
For each of the 44 questions below select either "a" or "b" to indicate your answer. Please choose only one answer for each question. If both "a" and "b" seem to apply to you, choose the one that applies more frequently. When you are finished selecting answers to each question please select the submit button at the end of the form.
1. I understand something better after I (a) try it out. (b) think it through.
2. I would rather be considered (a) realistic. (b) innovative.
3. When I think about what I did yesterday, I am most likely to get (a) a picture. (b) words.
4. I tend to
(a) understand details of a subject but may be fuzzy about its overall structure.
(b) understand the overall structure but may be fuzzy about details.
5. When I am learning something new, it helps me to (a) talk about it. (b) think about it.
6. If I were a teacher, I would rather teach a course (a) that deals with facts and real life situations. (b) that deals with ideas and theories.
7. I prefer to get new information in (a) pictures, diagrams, graphs, or maps. (b) written directions or verbal information.
8. Once I understand (a) all the parts, I understand the whole thing. (b) the whole thing, I see how the parts fit.
9. In a study group working on difficult material, I am more likely to (a) jump in and contribute ideas. (b) sit back and listen.
10. I find it easier (a) to learn facts. (b) to learn concepts.
11. In a book with lots of pictures and charts, I am likely to (a) look over the pictures and charts carefully. (b) focus on the written text.
12. When I solve math problems (a) I usually work my way to the solutions one step at a time. (b) I often just see the solutions but then have to struggle to figure
out the steps to get to them.
13. In classes I have taken (a) I have usually gotten to know many of the students. (b) I have rarely gotten to know many of the students.
14. In reading nonfiction, I prefer (a) something that teaches me new facts or tells me how to do
something. (b) something that gives me new ideas to think about.
15. I like teachers (a) who put a lot of diagrams on the board.
(b) who spend a lot of time explaining.
16. When I'm analyzing a story or a novel (a) I think of the incidents and try to put them together to figure out
the themes. (b) I just know what the themes are when I finish reading and then
I have to go back and find the incidents that demonstrate them.
17. When I start a homework problem, I am more likely to (a) start working on the solution immediately. (b) try to fully understand the problem first.
18. I prefer the idea of (a) certainty. (b) theory.
19. I remember best (a) what I see. (b) what I hear.
20. It is more important to me that an instructor (a) lay out the material in clear sequential steps. (b) give me an overall picture and relate the material to other
subjects.
21. I prefer to study (a) in a study group. (b) alone.
22. I am more likely to be considered (a) careful about the details of my work. (b) creative about how to do my work.
23. When I get directions to a new place, I prefer (a) a map. (b) written instructions.
24. I learn (a) at a fairly regular pace. If I study hard, I'll "get it." (b) in fits and starts. I'll be totally confused and then suddenly it all
"clicks."
25. I would rather first (a) try things out. (b) think about how I'm going to do it.
26. When I am reading for enjoyment, I like writers to (a) clearly say what they mean.
(b) say things in creative, interesting ways.
27. When I see a diagram or sketch in class, I am most likely to remember (a) the picture. (b) what the instructor said about it.
28. When considering a body of information, I am more likely to (a) focus on details and miss the big picture. (b) try to understand the big picture before getting into the details.
29. I more easily remember (a) something I have done. (b) something I have thought a lot about.
30. When I have to perform a task, I prefer to (a) master one way of doing it. (b) come up with new ways of doing it.
31. When someone is showing me data, I prefer (a) charts or graphs. (b) text summarizing the results.
32. When writing a paper, I am more likely to (a) work on (think about or write) the beginning of the paper and
progress forward. (b) work on (think about or write) different parts of the paper and
then order them.
33. When I have to work on a group project, I first want to (a) have "group brainstorming" where everyone contributes ideas. (b) brainstorm individually and then come together as a group to
compare ideas.
34. I consider it higher praise to call someone (a) sensible. (b) imaginative.
35. When I meet people at a party, I am more likely to remember (a) what they looked like. (b) what they said about themselves.
36. When I am learning a new subject, I prefer to (a) stay focused on that subject, learning as much about it as I can. (b) try to make connections between that subject and related
subjects.
37. I am more likely to be considered (a) outgoing.
(b) reserved.
38. I prefer courses that emphasize (a) concrete material (facts, data). (b) abstract material (concepts, theories).
39. For entertainment, I would rather (a) watch television. (b) read a book.
40. Some teachers start their lectures with an outline of what they will cover. Such outlines are (a) somewhat helpful to me. (b) very helpful to me.
41. The idea of doing homework in groups, with one grade for the entire group, (a) appeals to me. (b) does not appeal to me.
42. When I am doing long calculations, (a) I tend to repeat all my steps and check my work carefully. (b) I find checking my work tiresome and have to force myself to
do it.
43. I tend to picture places I have been (a) easily and fairly accurately. (b) with difficulty and without much detail.
44. When solving problems in a group, I would be more likely to (a) think of the steps in the solution process. (b) think of possible consequences or applications of the solution
in a wide range of areas.
When you have completed filling out the above form please click on the Submit button below. Your results will be returned to you. If you are not satisfied with your answers above please click on Reset to clear the form.
Submit Reset
Dr. Richard Felder, felder@ncsu.edu
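For readers who want to tally their answers, the sketch below shows how the 44 a/b responses are commonly scored into the four ILS dimensions (active/reflective, sensing/intuitive, visual/verbal, sequential/global). The item-to-dimension grouping used here (items 1, 5, 9, … for the first dimension, 2, 6, 10, … for the second, and so on) follows the published ILS scoring sheet and is an assumption, since the scoring key itself is not part of this excerpt.

```python
# Dimension order assumed: items 1,5,9,... -> active/reflective, items 2,6,10,... -> sensing/intuitive, etc.
DIMENSIONS = ["active-reflective", "sensing-intuitive", "visual-verbal", "sequential-global"]

def score_ils(answers):
    """Tally 44 'a'/'b' answers into the four ILS dimensions.

    answers: dict or list mapping question number 1..44 to 'a' or 'b'.
    Returns {dimension: (count_a, count_b, strength)} where strength is the
    usual odd-valued ILS score, e.g. '7a' or '3b' (11 items per dimension,
    so counts can never tie).
    """
    if isinstance(answers, (list, tuple)):
        answers = {i + 1: v for i, v in enumerate(answers)}
    result = {}
    for offset, dim in enumerate(DIMENSIONS, start=1):
        items = range(offset, 45, 4)              # 11 items per dimension
        a = sum(1 for q in items if answers[q] == "a")
        b = 11 - a
        strength = f"{abs(a - b)}{'a' if a > b else 'b'}"
        result[dim] = (a, b, strength)
    return result

# Example: a respondent who answers 'a' to every question scores 11a on every dimension.
print(score_ils(["a"] * 44))
```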

View File

@@ -0,0 +1,14 @@
Title: Index of Learning Styles Questionnaire
Author: System Administrator
Creator: Firefox
Producer: Mac OS X 10.4.11 Quartz PDFContext
CreationDate: 08/21/08 13:39:11
ModDate: 08/21/08 13:39:11
Tagged: no
Form: none
Pages: 6
Encrypted: no
Page size: 612 x 792 pts (letter) (rotated 0 degrees)
File size: 125329 bytes
Optimized: no
PDF version: 1.3

View File

@@ -0,0 +1,565 @@
In Presence: Teleoperators and Virtual Environments 6, 4 (August 1997), 355-385.
A Survey of Augmented Reality
Ronald T. Azuma Hughes Research Laboratories 3011 Malibu Canyon Road, MS RL96
Malibu, CA 90265 azuma@isl.hrl.hac.com http://www.cs.unc.edu/~azuma
W: (310) 317-5151 Fax: (310) 317-5695
Abstract
This paper surveys the field of Augmented Reality, in which 3-D virtual objects are integrated into a 3-D real environment in real time. It describes the medical, manufacturing, visualization, path planning, entertainment and military applications that have been explored. This paper describes the characteristics of Augmented Reality systems, including a detailed discussion of the tradeoffs between optical and video blending approaches. Registration and sensing errors are two of the biggest problems in building effective Augmented Reality systems, so this paper summarizes current efforts to overcome these problems. Future directions and areas requiring further research are discussed. This survey provides a starting point for anyone interested in researching or using Augmented Reality.
1. Introduction
1.1 Goals
This paper surveys the current state-of-the-art in Augmented Reality. It describes work performed at many different sites and explains the issues and problems encountered when building Augmented Reality systems. It summarizes the tradeoffs and approaches taken so far to overcome these problems and speculates on future directions that deserve exploration.
A survey paper does not present new research results. The contribution comes from consolidating existing information from many sources and publishing an extensive bibliography of papers in this field. While several other introductory papers have been written on this subject [Barfield95] [Bowskill95] [Caudell94] [Drascic93b] [Feiner94a] [Feiner94b] [Milgram94b] [Rolland94], this survey is more comprehensive and up-to-date. This survey provides a good beginning point for anyone interested in starting research in this area.
Section 1 describes what Augmented Reality is and the motivations for developing this technology. Six classes of potential applications that have been explored are described in Section 2. Then Section 3 discusses the issues involved in
building an Augmented Reality system. Currently, two of the biggest problems are in registration and sensing: the subjects of Sections 4 and 5. Finally, Section 6 describes some areas that require further work and research.
1.2 Definition
Augmented Reality (AR) is a variation of Virtual Environments (VE), or Virtual Reality as it is more commonly called. VE technologies completely immerse a user inside a synthetic environment. While immersed, the user cannot see the real world around him. In contrast, AR allows the user to see the real world, with virtual objects superimposed upon or composited with the real world. Therefore, AR supplements reality, rather than completely replacing it. Ideally, it would appear to the user that the virtual and real objects coexisted in the same space, similar to the effects achieved in the film "Who Framed Roger Rabbit?" Figure 1 shows an example of what this might look like. It shows a real desk with a real phone. Inside this room are also a virtual lamp and two virtual chairs. Note that the objects are combined in 3-D, so that the virtual lamp covers the real table, and the real table covers parts of the two virtual chairs. AR can be thought of as the "middle ground" between VE (completely synthetic) and telepresence (completely real) [Milgram94a] [Milgram94b].
Figure 1: Real desk with virtual lamp and two virtual chairs. (Courtesy ECRC)
Some researchers define AR in a way that requires the use of Head-Mounted Displays (HMDs). To avoid limiting AR to specific technologies, this survey defines AR as systems that have the following three characteristics:
1) Combines real and virtual
2) Interactive in real time
3) Registered in 3-D
This definition allows other technologies besides HMDs while retaining the essential components of AR. For example, it does not include film or 2-D overlays. Films like "Jurassic Park" feature photorealistic virtual objects seamlessly blended with a real environment in 3-D, but they are not interactive media. 2-D virtual overlays on top of live video can be done at interactive rates, but the overlays are not combined with the real world in 3-D. However, this definition does allow monitor-
based interfaces, monocular systems, see-through HMDs, and various other combining technologies. Potential system configurations are discussed further in Section 3.
1.3 Motivation
Why is Augmented Reality an interesting topic? Why is combining real and virtual objects in 3-D useful? Augmented Reality enhances a user's perception of and interaction with the real world. The virtual objects display information that the user cannot directly detect with his own senses. The information conveyed by the virtual objects helps a user perform real-world tasks. AR is a specific example of what Fred Brooks calls Intelligence Amplification (IA): using the computer as a tool to make a task easier for a human to perform [Brooks96].
At least six classes of potential AR applications have been explored: medical visualization, maintenance and repair, annotation, robot path planning, entertainment, and military aircraft navigation and targeting. The next section describes work that has been done in each area. While these do not cover every potential application area of this technology, they do cover the areas explored so far.
2. Applications
2.1 Medical
Doctors could use Augmented Reality as a visualization and training aid for surgery. It may be possible to collect 3-D datasets of a patient in real time, using non-invasive sensors like Magnetic Resonance Imaging (MRI), Computed Tomography scans (CT), or ultrasound imaging. These datasets could then be rendered and combined in real time with a view of the real patient. In effect, this would give a doctor "X-ray vision" inside a patient. This would be very useful during minimally-invasive surgery, which reduces the trauma of an operation by using small incisions or no incisions at all. A problem with minimally-invasive techniques is that they reduce the doctor's ability to see inside the patient, making surgery more difficult. AR technology could provide an internal view without the need for larger incisions.
AR might also be helpful for general medical visualization tasks in the surgical room. Surgeons can detect some features with the naked eye that they cannot see in MRI or CT scans, and vice-versa. AR would give surgeons access to both types of data simultaneously. This might also guide precision tasks, such as displaying where to drill a hole into the skull for brain surgery or where to perform a needle biopsy of a tiny tumor. The information from the non-invasive sensors would be directly displayed on the patient, showing exactly where to perform the operation.
AR might also be useful for training purposes [Kancherla95]. Virtual instructions could remind a novice surgeon of the required steps, without the need to look away from a patient to consult a manual. Virtual objects could also identify organs and specify locations to avoid disturbing [Durlach95].
Several projects are exploring this application area. At UNC Chapel Hill, a research group has conducted trial runs of scanning the womb of a pregnant woman with an ultrasound sensor, generating a 3-D representation of the fetus inside the womb and displaying that in a see-through HMD (Figure 2). The goal is to endow the doctor with the ability to see the moving, kicking fetus lying inside the womb, with the hope that this one day may become a "3-D stethoscope" [Bajura92] [State94]. More recent efforts have focused on a needle biopsy of a breast tumor. Figure 3 shows a mockup of a breast biopsy operation, where the virtual objects identify the location of the tumor and guide the needle to its target [State96b]. Other groups at the MIT AI Lab [Grimson94] [Grimson95] [Mellor95a] [Mellor95b], General Electric [Lorensen93], and elsewhere [Betting95] [Edwards95] [Taubes94] are investigating displaying MRI or CT data, directly registered onto the patient.
Figure 2: Virtual fetus inside womb of pregnant patient. (Courtesy UNC Chapel Hill Dept. of Computer Science.)
Figure 3: Mockup of breast tumor biopsy. 3-D graphics guide needle insertion. (Courtesy UNC Chapel Hill Dept. of Computer Science.)
2.2 Manufacturing and repair
Another category of Augmented Reality applications is the assembly, maintenance, and repair of complex machinery. Instructions might be easier to understand if they were available, not as manuals with text and pictures, but rather as 3-D drawings superimposed upon the actual equipment, showing step-by-step the tasks that need to be done and how to do them. These superimposed 3-D drawings can be animated, making the directions even more explicit.
Several research projects have demonstrated prototypes in this area. Steve Feiner's group at Columbia built a laser printer maintenance application [Feiner93a], shown in Figures 4 and 5. Figure 4 shows an external view, and Figure 5 shows the user's view, where the computer-generated wireframe is telling the user to remove the paper tray. A group at Boeing is developing AR technology to guide a technician in building a wiring harness that forms part of an airplane's electrical system. Storing these instructions in electronic form will save space and reduce costs. Currently, technicians use large physical layout boards to construct such harnesses, and Boeing requires several warehouses to store all these boards. Such space might be emptied for other use if this application proves successful [Caudell92] [Janin93] [Sims94]. Boeing is using a Technology Reinvestment Program (TRP) grant to investigate putting this technology onto the factory floor [BoeingTRP94]. Figure 6 shows an external view of Adam Janin using a prototype AR system to build a wire bundle. Eventually, AR might be used for any complicated machinery, such as automobile engines [Tuceryan95].
Figure 4: External view of Columbia printer maintenance application. Note that all objects must be tracked. (Courtesy Steve Feiner, Blair MacIntyre, and Dorée
Seligmann, Columbia University.)
Figure 5: Prototype laser printer maintenance application, displaying how to remove the paper tray. (Courtesy Steve Feiner, Blair MacIntyre, and Dorée
Seligmann, Columbia University.)
Figure 6: Adam Janin demonstrates Boeing's prototype wire bundle assembly application. (Courtesy David Mizell, Boeing)
2.3 Annotation and visualization
AR could be used to annotate objects and environments with public or private information. Applications using public information assume the availability of public databases to draw upon. For example, a hand-held display could provide information about the contents of library shelves as the user walks around the library [Fitzmaurice93] [Rekimoto95a] [Rekimoto95b]. At the European Computer-Industry Research Centre (ECRC), a user can point at parts of an engine model and the AR system displays the name of the part that is being pointed at [Rose95]. Figure 7 shows this, where the user points at the exhaust manifold on an engine model and the label "exhaust manifold" appears.
Figure 7: Engine model part labels appear as user points at them. (Courtesy ECRC)
Alternately, these annotations might be private notes attached to specific objects. Researchers at Columbia demonstrated this with the notion of attaching windows from a standard user interface onto specific locations in the world, or attached to specific objects as reminders [Feiner93b]. Figure 8 shows a window superimposed as a label upon a student. He wears a tracking device, so the computer knows his location. As the student moves around, the label follows his location, providing the AR user with a reminder of what he needs to talk to the student about.
Figure 8: Windows displayed on top of specific real-world objects. (Courtesy Steve Feiner, Blair MacIntyre, Marcus Haupt, and Eliot Solomon, Columbia University.)
AR might aid general visualization tasks as well. An architect with a see-through HMD might be able to look out a window and see how a proposed new skyscraper would change her view. If a database containing information about a building's structure was available, AR might give architects "X-ray vision" inside a building, showing where the pipes, electric lines, and structural supports are inside the walls [Feiner95]. Researchers at the University of Toronto have built a system called Augmented Reality through Graphic Overlays on Stereovideo (ARGOS) [Milgram95], which among other things is used to make images easier to understand during difficult viewing conditions [Drascic93a]. Figure 9 shows wireframe lines drawn on top of a space shuttle bay interior, while in orbit. The lines make it easier to see the geometry of the shuttle bay. Similarly, virtual lines and objects could aid navigation and scene understanding during poor visibility conditions, such as underwater or in fog.
Figure 9: Virtual lines help display geometry of shuttle bay, as seen in orbit. (Courtesy David Drascic and Paul Milgram, U. Toronto.)
2.4 Robot path planning
Teleoperation of a robot is often a difficult problem, especially when the robot is far away, with long delays in the communication link. Under this circumstance,
instead of controlling the robot directly, it may be preferable to control a virtual version of the robot instead. The user plans and specifies the robot's actions by manipulating the local virtual version, in real time. The results are directly displayed on the real world. Once the plan is tested and determined, the user then tells the real robot to execute the specified plan. This avoids pilot-induced oscillations caused by the lengthy delays. The virtual versions can also predict the effects of manipulating the environment, thus serving as a planning and previewing tool to aid the user in performing the desired task. The ARGOS system has demonstrated that stereoscopic AR is an easier and more accurate way of doing robot path planning than traditional monoscopic interfaces [Drascic93b] [Milgram93]. Others have also used registered overlays with telepresence systems [Kim93] [Kim96] [Oyama93] [Tharp94] [Yoo93]. Figure 10 shows how a virtual outline can represent a future location of a robot arm.
Figure 10: Virtual lines show a planned motion of a robot arm (Courtesy David Drascic and Paul Milgram, U. Toronto.)
2.5 Entertainment
At SIGGRAPH '95, several exhibitors showed "Virtual Sets" that merge real actors with virtual backgrounds, in real time and in 3-D. The actors stand in front of a large blue screen, while a computer-controlled motion camera records the scene. Since the camera's location is tracked, and the actor's motions are scripted, it is possible to digitally composite the actor into a 3-D virtual background. For example, the actor might appear to stand inside a large virtual spinning ring, where the front part of the ring covers the actor while the rear part of the ring is covered by the actor. The entertainment industry sees this as a way to reduce production costs: creating and storing sets virtually is potentially cheaper than constantly building new physical sets from scratch. The ALIVE project from the MIT Media Lab goes one step further by
populating the environment with intelligent virtual creatures that respond to user actions [Maes95].
2.6 Military aircraft
For many years, military aircraft and helicopters have used Head-Up Displays (HUDs) and Helmet-Mounted Sights (HMS) to superimpose vector graphics upon the pilot's view of the real world. Besides providing basic navigation and flight information, these graphics are sometimes registered with targets in the environment, providing a way to aim the aircraft's weapons. For example, the chin turret in a helicopter gunship can be slaved to the pilot's HMS, so the pilot can aim the chin turret simply by looking at the target. Future generations of combat aircraft will be developed with an HMD built into the pilot's helmet [Wanstall89].
3. Characteristics
This section discusses the characteristics of AR systems and design issues encountered when building an AR system. Section 3.1 describes the basic characteristics of augmentation. There are two ways to accomplish this augmentation: optical or video technologies. Section 3.2 discusses their characteristics and relative strengths and weaknesses. Blending the real and virtual poses problems with focus and contrast (Section 3.3), and some applications require portable AR systems to be truly effective (Section 3.4). Finally, Section 3.5 summarizes the characteristics by comparing the requirements of AR against those for Virtual Environments.
3.1 Augmentation
Besides adding objects to a real environment, Augmented Reality also has the potential to remove them. Current work has focused on adding virtual objects to a real environment. However, graphic overlays might also be used to remove or hide parts of the real environment from a user. For example, to remove a desk in the real environment, draw a representation of the real walls and floors behind the desk and "paint" that over the real desk, effectively removing it from the user's sight. This has been done in feature films. Doing this interactively in an AR system will be much harder, but this removal may not need to be photorealistic to be effective.
Augmented Reality might apply to all senses, not just sight. So far, researchers have focused on blending real and virtual images and graphics. However, AR could be extended to include sound. The user would wear headphones equipped with microphones on the outside. The headphones would add synthetic, directional 3D sound, while the external microphones would detect incoming sounds from the environment. This would give the system a chance to mask or cover up selected real sounds from the environment by generating a masking signal that exactly canceled
the incoming real sound [Durlach95]. While this would not be easy to do, it might be possible. Another example is haptics. Gloves with devices that provide tactile feedback might augment real forces in the environment. For example, a user might run his hand over the surface of a real desk. Simulating such a hard surface virtually is fairly difficult, but it is easy to do in reality. Then the tactile effectors in the glove can augment the feel of the desk, perhaps making it feel rough in certain spots. This capability might be useful in some applications, such as providing an additional cue that a virtual object is at a particular location on a real desk [Wellner93].
3.2 Optical vs. video
A basic design decision in building an AR system is how to accomplish the combining of real and virtual. Two basic choices are available: optical and video technologies. Each has particular advantages and disadvantages. This section compares the two and notes the tradeoffs. For additional discussion, see [Rolland94].
A see-through HMD is one device used to combine real and virtual. Standard closed-view HMDs do not allow any direct view of the real world. In contrast, a see-through HMD lets the user see the real world, with virtual objects superimposed by optical or video technologies.
Optical see-through HMDs work by placing optical combiners in front of the user's eyes. These combiners are partially transmissive, so that the user can look directly through them to see the real world. The combiners are also partially reflective, so that the user sees virtual images bounced off the combiners from head-mounted monitors. This approach is similar in nature to Head-Up Displays (HUDs) commonly used in military aircraft, except that the combiners are attached to the head. Thus, optical see-through HMDs have sometimes been described as a "HUD on a head" [Wanstall89]. Figure 11 shows a conceptual diagram of an optical see-through HMD. Figure 12 shows two optical see-through HMDs made by Hughes Electronics.
The optical combiners usually reduce the amount of light that the user sees from the real world. Since the combiners act like half-silvered mirrors, they only let in some of the light from the real world, so that they can reflect some of the light from the monitors into the user's eyes. For example, the HMD described in [Holmgren92] transmits about 30% of the incoming light from the real world. Choosing the level of blending is a design problem. More sophisticated combiners might vary the level of contributions based upon the wavelength of light. For example, such a combiner might be set to reflect all light of a certain wavelength and none at any other wavelengths. This would be ideal with a monochrome monitor. Virtually all the light from the monitor would be reflected into the user's eyes, while almost all the light from the real world (except at the particular wavelength) would reach the user's eyes. However, most existing optical see-through HMDs do reduce the amount of light from the real world, so they act like a pair of sunglasses when the power is cut off.
[Diagram labels: Head Tracker, Head locations, Scene generator, Graphic images, Monitors, Optical combiners, Real world]
Figure 11: Optical see-through HMD conceptual diagram
Figure 12: Two optical see-through HMDs, made by Hughes Electronics
In contrast, video see-through HMDs work by combining a closed-view HMD with one or two head-mounted video cameras. The video cameras provide the user's view of the real world. Video from these cameras is combined with the graphic images created by the scene generator, blending the real and virtual. The result is sent to the monitors in front of the user's eyes in the closed-view HMD. Figure 13 shows a conceptual diagram of a video see-through HMD. Figure 14 shows an actual video see-through HMD, with two video cameras mounted on top of a Flight Helmet.
[Diagram labels: Video cameras, Video of real world, Head Tracker, Head locations, Scene generator, Graphic images, Video compositor, Combined video, Monitors, Real World]
Figure 13: Video see-through HMD conceptual diagram
Figure 14: An actual video see-through HMD. (Courtesy Jannick Rolland, Frank Biocca, and UNC Chapel Hill Dept. of Computer Science. Photo by Alex Treml.)
Video composition can be done in more than one way. A simple way is to use chroma-keying: a technique used in many video special effects. The background of the computer graphic images is set to a specific color, say green, which none of the virtual objects use. Then the combining step replaces all green areas with the corresponding parts from the video of the real world. This has the effect of superimposing the virtual objects over the real world. A more sophisticated composition would use depth information. If the system had depth information at each pixel for the real world images, it could combine the real and virtual images by a pixel-by-pixel depth comparison. This would allow real objects to cover virtual objects and vice-versa.
AR systems can also be built using monitor-based configurations, instead of see-through HMDs. Figure 15 shows how a monitor-based system might be built. In this case, one or two video cameras view the environment. The cameras may be static or mobile. In the mobile case, the cameras might move around by being attached to a robot, with their locations tracked. The video of the real world and the graphic images generated by a scene generator are combined, just as in the video see-through HMD case, and displayed in a monitor in front of the user. The user does not wear the display device. Optionally, the images may be displayed in stereo on the monitor, which then requires the user to wear a pair of stereo glasses. Figure 16 shows an external view of the ARGOS system, which uses a monitor-based configuration.
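The two per-pixel composition strategies just described, chroma-keying and depth comparison, can be sketched in a few lines of NumPy. The green key color, the array shapes, and the function names below are illustrative assumptions rather than any particular system's interface.

# Sketch of the two per-pixel video composition strategies described above.
# Assumes images are NumPy arrays of shape (H, W, 3) and depth maps of shape (H, W);
# the green key color is an illustrative choice, not a fixed standard.
import numpy as np

KEY_COLOR = np.array([0, 255, 0], dtype=np.uint8)  # background color reserved for "no virtual object"

def chroma_key_composite(virtual_rgb, real_rgb):
    """Replace every key-colored pixel of the rendered image with real video."""
    is_background = np.all(virtual_rgb == KEY_COLOR, axis=-1)          # (H, W) mask
    return np.where(is_background[..., None], real_rgb, virtual_rgb)

def depth_composite(virtual_rgb, virtual_depth, real_rgb, real_depth):
    """Pixel-by-pixel depth comparison: the nearer surface wins, so real
    objects can occlude virtual ones and vice-versa."""
    virtual_wins = virtual_depth < real_depth                           # (H, W) mask
    return np.where(virtual_wins[..., None], virtual_rgb, real_rgb)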
[Diagram labels: Video cameras, Video of real world, Tracker, Locations, Scene generator, Graphic images, Combiner, Monitor, Stereo glasses (optional)]
Figure 15: Monitor-based AR conceptual diagram
Figure 16: External view of the ARGOS system, an example of monitor-based AR. (Courtesy David Drascic and Paul Milgram, U. Toronto.)
Finally, a monitor-based optical configuration is also possible. This is similar to Figure 11 except that the user does not wear the monitors or combiners on her head. Instead, the monitors and combiners are fixed in space, and the user positions her head to look through the combiners. This is typical of Head-Up Displays on military aircraft, and at least one such configuration has been proposed for a medical application [Peuchot95].
The rest of this section compares the relative advantages and disadvantages of optical and video approaches, starting with optical. An optical approach has the following advantages over a video approach:
1) Simplicity: Optical blending is simpler and cheaper than video blending. Optical approaches have only one "stream" of video to worry about: the graphic images. The real world is seen directly through the combiners, and that time delay is generally a few nanoseconds. Video blending, on the other hand, must deal with separate video streams for the real and virtual images. Both streams have inherent
delays in the tens of milliseconds. Digitizing video images usually adds at least one frame time of delay to the video stream, where a frame time is how long it takes to completely update an image. A monitor that completely refreshes the screen at 60 Hz has a frame time of 16.67 ms. The two streams of real and virtual images must be properly synchronized or temporal distortion results. Also, optical see-through HMDs with narrow field-of-view combiners offer views of the real world that have little distortion. Video cameras almost always have some amount of distortion that must be compensated for, along with any distortion from the optics in front of the display devices. Since video requires cameras and combiners that optical approaches do not need, video will probably be more expensive and complicated to build than optical-based systems.
2) Resolution: Video blending limits the resolution of what the user sees, both real and virtual, to the resolution of the display devices. With current displays, this resolution is far less than the resolving power of the fovea. Optical see-through also shows the graphic images at the resolution of the display device, but the user's view of the real world is not degraded. Thus, video reduces the resolution of the real world, while optical see-through does not.
3) Safety: Video see-through HMDs are essentially modified closed-view HMDs. If the power is cut off, the user is effectively blind. This is a safety concern in some applications. In contrast, when power is removed from an optical see-through HMD, the user still has a direct view of the real world. The HMD then becomes a pair of heavy sunglasses, but the user can still see.
4) No eye offset: With video see-through, the user's view of the real world is provided by the video cameras. In essence, this puts his "eyes" where the video cameras are. In most configurations, the cameras are not located exactly where the user's eyes are, creating an offset between the cameras and the real eyes. The distance separating the cameras may also not be exactly the same as the user's interpupillary distance (IPD). This difference between camera locations and eye locations introduces displacements from what the user sees compared to what he expects to see. For example, if the cameras are above the user's eyes, he will see the world from a vantage point slightly taller than he is used to. Video see-through can avoid the eye offset problem through the use of mirrors to create another set of optical paths that mimic the paths directly into the user's eyes. Using those paths, the cameras will see what the user's eyes would normally see without the HMD. However, this adds complexity to the HMD design. Offset is generally not a difficult design problem for optical see-through displays. While the user's eye can rotate with respect to the position of the HMD, the resulting errors are tiny. Using the eye's center of rotation as the viewpoint in the computer graphics model should eliminate any need for eye tracking in an optical see-through HMD [Holloway95].
Video blending offers the following advantages over optical blending:
1) Flexibility in composition strategies: A basic problem with optical see-through is that the virtual objects do not completely obscure the real world objects, because the optical combiners allow light from both virtual and real sources. Building an optical see-through HMD that can selectively shut out the light from the real world is difficult. In a normal optical system, the objects are designed to be in
focus at only one point in the optical path: the user's eye. Any filter that would selectively block out light must be placed in the optical path at a point where the image is in focus, which obviously cannot be the user's eye. Therefore, the optical system must have two places where the image is in focus: at the user's eye and the point of the hypothetical filter. This makes the optical design much more difficult and complex. No existing optical see-through HMD blocks incoming light in this fashion. Thus, the virtual objects appear ghost-like and semi-transparent. This damages the illusion of reality because occlusion is one of the strongest depth cues. In contrast, video see-through is far more flexible about how it merges the real and virtual images. Since both the real and virtual are available in digital form, video see-through compositors can, on a pixel-by-pixel basis, take the real, or the virtual, or some blend between the two to simulate transparency. Because of this flexibility, video see-through may ultimately produce more compelling environments than optical see-through approaches.
2) Wide field-of-view: Distortions in optical systems are a function of the radial distance away from the optical axis. The further one looks away from the center of the view, the larger the distortions get. A digitized image taken through a distorted optical system can be undistorted by applying image processing techniques to unwarp the image, provided that the optical distortion is well characterized. This requires significant amounts of computation, but this constraint will be less important in the future as computers become faster. It is harder to build wide field-of-view displays with optical see-through techniques. Any distortions of the user's view of the real world must be corrected optically, rather than digitally, because the system has no digitized image of the real world to manipulate. Complex optics are expensive and add weight to the HMD. Wide field-of-view systems are an exception to the general trend of optical approaches being simpler and cheaper than video approaches.
3) Real and virtual view delays can be matched: Video offers an approach for reducing or avoiding problems caused by temporal mismatches between the real and virtual images. Optical see-through HMDs offer an almost instantaneous view of the real world but a delayed view of the virtual. This temporal mismatch can cause problems. With video approaches, it is possible to delay the video of the real world to match the delay from the virtual image stream. For details, see Section 4.3.
4) Additional registration strategies: In optical see-through, the only information the system has about the user's head location comes from the head tracker. Video blending provides another source of information: the digitized image of the real scene. This digitized image means that video approaches can employ additional registration strategies unavailable to optical approaches. Section 4.4 describes these in more detail.
5) Easier to match the brightness of real and virtual objects: This is discussed in Section 3.3.
Both optical and video technologies have their roles, and the choice of technology depends on the application requirements. Many of the mechanical assembly and repair prototypes use optical approaches, possibly because of the cost and safety issues. If successful, the equipment would have to be replicated in large numbers to equip workers on a factory floor. In contrast, most of the prototypes for
medical applications use video approaches, probably for the flexibility in blending real and virtual and for the additional registration strategies offered.
3.3 Focus and contrast
Focus can be a problem for both optical and video approaches. Ideally, the virtual should match the real. In a video-based system, the combined virtual and real image will be projected at the same distance by the monitor or HMD optics. However, depending on the video camera's depth-of-field and focus settings, parts of the real world may not be in focus. In typical graphics software, everything is rendered with a pinhole model, so all the graphic objects, regardless of distance, are in focus. To overcome this, the graphics could be rendered to simulate a limited depth-of-field, and the video camera might have an autofocus lens.
In the optical case, the virtual image is projected at some distance away from the user. This distance may be adjustable, although it is often fixed. Therefore, while the real objects are at varying distances from the user, the virtual objects are all projected to the same distance. If the virtual and real distances are not matched for the particular objects that the user is looking at, it may not be possible to clearly view both simultaneously.
Contrast is another issue because of the large dynamic range in real environments and in what the human eye can detect. Ideally, the brightness of the real and virtual objects should be appropriately matched. Unfortunately, in the worst case scenario, this means the system must match a very large range of brightness levels. The eye is a logarithmic detector, where the brightest light that it can handle is about eleven orders of magnitude greater than the smallest, including both dark-adapted and light-adapted eyes. In any one adaptation state, the eye can cover about six orders of magnitude. Most display devices cannot come close to this level of contrast. This is a particular problem with optical technologies, because the user has a direct view of the real world. If the real environment is too bright, it will wash out the virtual image. If the real environment is too dark, the virtual image will wash out the real world. Contrast problems are not as severe with video, because the video cameras themselves have limited dynamic response, and the view of both the real and virtual is generated by the monitor, so everything must be clipped or compressed into the monitor's dynamic range.
3.4 Portability
In almost all Virtual Environment systems, the user is not encouraged to walk around much. Instead, the user navigates by "flying" through the environment, walking on a treadmill, or driving some mockup of a vehicle. Whatever the technology, the result is that the user stays in one place in the real world.
Some AR applications, however, will need to support a user who will walk around a large environment. AR requires that the user actually be at the place where the task is to take place. "Flying," as performed in a VE system, is no longer an
option. If a mechanic needs to go to the other side of a jet engine, she must physically move herself and the display devices she wears. Therefore, AR systems will place a premium on portability, especially the ability to walk around outdoors, away from controlled environments. The scene generator, the HMD, and the tracking system must all be self-contained and capable of surviving exposure to the environment. If this capability is achieved, many more applications that have not been tried will become available. For example, the ability to annotate the surrounding environment could be useful to soldiers, hikers, or tourists in an unfamiliar new location.
3.5 Comparison against virtual environments
The overall requirements of AR can be summarized by comparing them against the requirements for Virtual Environments, for the three basic subsystems that they require.
1) Scene generator: Rendering is not currently one of the major problems in AR. VE systems have much higher requirements for realistic images because they completely replace the real world with the virtual environment. In AR, the virtual images only supplement the real world. Therefore, fewer virtual objects need to be drawn, and they do not necessarily have to be realistically rendered in order to serve the purposes of the application. For example, in the annotation applications, text and 3-D wireframe drawings might suffice. Ideally, photorealistic graphic objects would be seamlessly merged with the real environment (see Section 7), but more basic problems have to be solved first.
2) Display device: The display devices used in AR may have less stringent requirements than VE systems demand, again because AR does not replace the real world. For example, monochrome displays may be adequate for some AR applications, while virtually all VE systems today use full color. Optical see-through HMDs with a small field-of-view may be satisfactory because the user can still see the real world with his peripheral vision; the see-through HMD does not shut off the user's normal field-of-view. Furthermore, the resolution of the monitor in an optical see-through HMD might be lower than what a user would tolerate in a VE application, since the optical see-through HMD does not reduce the resolution of the real environment.
3) Tracking and sensing: While in the previous two cases AR had lower requirements than VE, that is not the case for tracking and sensing. In this area, the requirements for AR are much stricter than those for VE systems. A major reason for this is the registration problem, which is described in the next section. The other factors that make the tracking and sensing requirements higher are described in Section 5.
4. Registration
4.1 The registration problem
One of the most basic problems currently limiting Augmented Reality applications is the registration problem. The objects in the real and virtual worlds must be properly aligned with respect to each other, or the illusion that the two worlds coexist will be compromised. More seriously, many applications demand accurate registration. For example, recall the needle biopsy application. If the virtual object is not where the real tumor is, the surgeon will miss the tumor and the biopsy will fail. Without accurate registration, Augmented Reality will not be accepted in many applications.
Registration problems also exist in Virtual Environments, but they are not nearly as serious because they are harder to detect than in Augmented Reality. Since the user only sees virtual objects in VE applications, registration errors result in visual-kinesthetic and visual-proprioceptive conflicts. Such conflicts between different human senses may be a source of motion sickness [Pausch92]. Because the kinesthetic and proprioceptive systems are much less sensitive than the visual system, visual-kinesthetic and visual-proprioceptive conflicts are less noticeable than visual-visual conflicts. For example, a user wearing a closed-view HMD might hold up her real hand and see a virtual hand. This virtual hand should be displayed exactly where she would see her real hand, if she were not wearing an HMD. But if the virtual hand is wrong by five millimeters, she may not detect that unless actively looking for such errors. The same error is much more obvious in a see-through HMD, where the conflict is visual-visual.
Furthermore, a phenomenon known as visual capture [Welch78] makes it even more difficult to detect such registration errors. Visual capture is the tendency of the brain to believe what it sees rather than what it feels, hears, etc. That is, visual information tends to override all other senses. When watching a television program, a viewer believes the sounds come from the mouths of the actors on the screen, even though they actually come from a speaker in the TV. Ventriloquism works because of visual capture. Similarly, a user might believe that her hand is where the virtual hand is drawn, rather than where her real hand actually is, because of visual capture. This effect increases the amount of registration error users can tolerate in Virtual Environment systems. If the errors are systematic, users might even be able to adapt to the new environment, given a long exposure time of several hours or days [Welch78].
Augmented Reality demands much more accurate registration than Virtual Environments [Azuma93]. Imagine the same scenario of a user holding up her hand, but this time wearing a see-through HMD. Registration errors now result in visual-visual conflicts between the images of the virtual and real hands. Such conflicts are easy to detect because of the resolution of the human eye and the sensitivity of the human visual system to differences. Even tiny offsets in the images of the real and virtual hands are easy to detect.
What angular accuracy is needed for good registration in Augmented Reality? A simple demonstration will show the order of magnitude required. Take out a dime and hold it at arm's length, so that it looks like a circle. The diameter of the dime covers about 1.2 to 2.0 degrees of arc, depending on your arm length. In comparison, the width of a full moon is about 0.5 degrees of arc! Now imagine a virtual object superimposed on a real object, but offset by the diameter of the full moon. Such a difference would be easy to detect. Thus, the angular accuracy required is a small fraction of a degree. The lower limit is bounded by the resolving power of the human eye itself. The central part of the retina is called the fovea, which has the highest density of color-detecting cones, about 120 per degree of arc, corresponding to a spacing of half a minute of arc [Jain89]. Observers can differentiate between a dark and light bar grating when each bar subtends about one minute of arc, and under special circumstances they can detect even smaller differences [Doenges85]. However, existing HMD trackers and displays are not capable of providing one minute of arc in accuracy, so the present achievable accuracy is much worse than that ultimate lower bound. In practice, errors of a few pixels are detectable in modern HMDs.
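As a quick check on the dime demonstration (assuming a coin diameter of roughly 18 mm and arm lengths of 60 to 85 cm, which are round numbers rather than values from the text), the subtended angle is

$$\theta \approx \frac{d}{L}\cdot\frac{180^\circ}{\pi}, \qquad \frac{0.018}{0.60}\cdot\frac{180^\circ}{\pi}\approx 1.7^\circ, \qquad \frac{0.018}{0.85}\cdot\frac{180^\circ}{\pi}\approx 1.2^\circ,$$

and the foveal cone density of about 120 per degree corresponds to a spacing of $1/120^\circ \approx 0.5$ minutes of arc, the lower bound mentioned above.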
Registration of real and virtual objects is not limited to AR. Special-effects artists seamlessly integrate computer-generated 3-D objects with live actors in film and video. The difference lies in the amount of control available. With film, a director can carefully plan each shot, and artists can spend hours per frame, adjusting each by hand if necessary, to achieve perfect registration. As an interactive medium, AR is far more difficult to work with. The AR system cannot control the motions of the HMD wearer. The user looks where she wants, and the system must respond within tens of milliseconds.
Registration errors are difficult to adequately control because of the high accuracy requirements and the numerous sources of error. These sources of error can be divided into two types: static and dynamic. Static errors are the ones that cause registration errors even when the user's viewpoint and the objects in the environment remain completely still. Dynamic errors are the ones that have no effect until either the viewpoint or the objects begin moving.
For current HMD-based systems, dynamic errors are by far the largest contributors to registration errors, but static errors cannot be ignored either. The next two sections discuss static and dynamic errors and what has been done to reduce them. See [Holloway95] for a thorough analysis of the sources and magnitudes of registration errors.
4.2 Static errors
The four main sources of static errors are:
• Optical distortion
• Errors in the tracking system
• Mechanical misalignments
• Incorrect viewing parameters (e.g., field of view, tracker-to-eye position and orientation, interpupillary distance)
1) Distortion in the optics: Optical distortions exist in most camera and lens systems, both in the cameras that record the real environment and in the optics used for the display. Because distortions are usually a function of the radial distance away from the optical axis, wide field-of-view displays can be especially vulnerable to this error. Near the center of the field-of-view, images are relatively undistorted, but far away from the center, image distortion can be large. For example, straight lines may appear curved. In a see-through HMD with narrow field-of-view displays, the optical combiners add virtually no distortion, so the user's view of the real world is not warped. However, the optics used to focus and magnify the graphic images from the display monitors can introduce distortion. This mapping of distorted virtual images on top of an undistorted view of the real world causes static registration errors. The cameras and displays may also have nonlinear distortions that cause errors [Deering92].
Optical distortions are usually systematic errors, so they can be mapped and compensated. This mapping may not be trivial, but it is often possible. For example, [Robinett92b] describes the distortion of one commonly-used set of HMD optics. The distortions might be compensated by additional optics. [Edwards93] describes such a design for a video see-through HMD. This can be a difficult design problem, though, and it will add weight, which is not desirable in HMDs. An alternate approach is to do the compensation digitally. This can be done by image warping techniques, both on the digitized video and the graphic images. Typically, this involves predistorting the images so that they will appear undistorted after being displayed [Watson95]. Another way to perform digital compensation on the graphics is to apply the predistortion functions on the vertices of the polygons, in screen space, before rendering [Rolland93]. This requires subdividing polygons that cover large areas in screen space. Both digital compensation methods can be computationally expensive, often requiring special hardware to accomplish in real time. Holloway determined that the additional system delay required by the distortion compensation adds more registration error than the distortion compensation removes, for typical head motion [Holloway95].
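A minimal sketch of the digital compensation idea, assuming a simple radially symmetric polynomial lens model; the single cubic coefficient k1 and the fixed-point inversion are illustrative choices, and a real HMD or camera would be characterized by measurement and may need higher-order terms.

# Sketch of digital predistortion for a radially symmetric lens model.
# The coefficient k1 is an illustrative value; real optics are measured.
import numpy as np

def distort(xy, k1=0.2):
    """Model of what the optics do: points are pushed outward with radius."""
    r2 = np.sum(xy**2, axis=-1, keepdims=True)   # squared distance from the optical axis
    return xy * (1.0 + k1 * r2)

def predistort(xy, k1=0.2, iterations=5):
    """Invert the model by fixed-point iteration so that distort(predistort(p)) ~ p.
    Applied to vertices (or to the whole image as a warp) before display."""
    undistorted = xy.copy()
    for _ in range(iterations):
        r2 = np.sum(undistorted**2, axis=-1, keepdims=True)
        undistorted = xy / (1.0 + k1 * r2)
    return undistorted

# Example: a point near the edge of the normalized image plane.
p = np.array([[0.8, 0.6]])
print(distort(predistort(p)))   # approximately [[0.8, 0.6]] after compensation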
2) Errors in the tracking system: Errors in the reported outputs from the tracking and sensing systems are often the most serious type of static registration errors. These distortions are not easy to measure and eliminate, because that requires another "3-D ruler" that is more accurate than the tracker being tested. These errors are often non-systematic and difficult to fully characterize. Almost all commercially available tracking systems are not accurate enough to satisfy the requirements of AR systems. Section 5 discusses this important topic further.
3) Mechanical misalignments: Mechanical misalignments are discrepancies between the model or specification of the hardware and the actual physical properties of the real system. For example, the combiners, optics, and monitors in an optical see-through HMD may not be at the expected distances or orientations with respect to each other. If the frame is not sufficiently rigid, the various component parts may change their relative positions as the user moves around, causing errors. Mechanical misalignments can cause subtle changes in the position and orientation of the
projected virtual images that are difficult to compensate. While some alignment errors can be calibrated, for many others it may be more effective to "build it right" initially.
4) Incorrect viewing parameters: Incorrect viewing parameters, the last major source of static registration errors, can be thought of as a special case of alignment errors where calibration techniques can be applied. Viewing parameters specify how to convert the reported head or camera locations into viewing matrices used by the scene generator to draw the graphic images. For an HMD-based system, these parameters include:
• Center of projection and viewport dimensions
• Offset, both in translation and orientation, between the location of the head tracker and the user's eyes
• Field of view
Incorrect viewing parameters cause systematic static errors. Take the example of a head tracker located above a user's eyes. If the vertical translation offsets between the tracker and the eyes are too small, all the virtual objects will appear lower than they should.
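A minimal sketch of how a reported tracker pose and a tracker-to-eye offset could be composed into a viewing matrix, assuming 4x4 homogeneous transforms; the specific offset numbers and function names are illustrative, not measured calibration values.

# Sketch of turning reported head-tracker poses into per-eye viewing matrices.
# All transforms are 4x4 homogeneous matrices; the tracker-to-eye offset below
# (eye 3.2 cm to the side, 8 cm below, 2 cm behind the tracker) is illustrative.
import numpy as np

def translation(x, y, z):
    m = np.eye(4)
    m[:3, 3] = [x, y, z]
    return m

def view_matrix(world_from_tracker, tracker_from_eye):
    """The scene generator draws from the eye's point of view, i.e. with the
    inverse of the eye's pose in the world."""
    world_from_eye = world_from_tracker @ tracker_from_eye
    return np.linalg.inv(world_from_eye)

world_from_tracker = translation(0.0, 1.7, 0.0)            # pose reported by the tracker
tracker_from_left_eye = translation(-0.032, -0.08, -0.02)  # calibrated offset (illustrative)
left_view = view_matrix(world_from_tracker, tracker_from_left_eye)

If the vertical component of that offset is estimated too small, every virtual object is drawn from a viewpoint that is too high and therefore appears lower than it should, which is exactly the systematic error described above.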
In some systems, the viewing parameters are estimated by manual adjustments, in a non-systematic fashion. Such approaches proceed as follows: place a real object in the environment and attempt to register a virtual object with that real object. While wearing the HMD or positioning the cameras, move to one viewpoint or a few selected viewpoints and manually adjust the location of the virtual object and the other viewing parameters until the registration "looks right." This may achieve satisfactory results if the environment and the viewpoint remain static. However, such approaches require a skilled user and generally do not achieve robust results for many viewpoints. Achieving good registration from a single viewpoint is much easier than registration from a wide variety of viewpoints using a single set of parameters. Usually what happens is satisfactory registration at one viewpoint, but when the user walks to a significantly different viewpoint, the registration is inaccurate because of incorrect viewing parameters or tracker distortions. This means many different sets of parameters must be used, which is a less than satisfactory solution.
Another approach is to directly measure the parameters, using various measuring tools and sensors. For example, a commonly-used optometrist's tool can measure the interpupillary distance. Rulers might measure the offsets between the tracker and eye positions. Cameras could be placed where the user's eyes would normally be in an optical see-through HMD. By recording what the camera sees, through the see-through HMD, of the real environment, one might be able to determine several viewing parameters. So far, direct measurement techniques have enjoyed limited success [Janin93].
View-based tasks are another approach to calibration. These ask the user to perform various tasks that set up geometric constraints. By performing several tasks, enough information is gathered to determine the viewing parameters. For example, [Azuma94] asked a user wearing an optical see-through HMD to look straight
through a narrow pipe mounted in the real environment. This sets up the constraint that the user's eye must be located along a line through the center of the pipe. Combining this with other tasks created enough constraints to measure all the viewing parameters. [Caudell92] used a different set of tasks, involving lining up two circles that specified a cone in the real environment. [Oishi96] moves virtual cursors to appear on top of beacons in the real environment. All view-based tasks rely upon the user accurately performing the specified task and assume the tracker is accurate. If the tracking and sensing equipment is not accurate, then multiple measurements must be taken and optimizers used to find the "best-fit" solution [Janin93].
For video-based systems, an extensive body of literature exists in the robotics and photogrammetry communities on camera calibration techniques; see the references in [Lenz88] for a start. Such techniques compute a camera's viewing parameters by taking several pictures of an object of fixed and sometimes unknown geometry. These pictures must be taken from different locations. Matching points in the 2-D images with corresponding 3-D points on the object sets up mathematical constraints. With enough pictures, these constraints determine the viewing parameters and the 3-D location of the calibration object. Alternately, they can serve to drive an optimization routine that will search for the best set of viewing parameters that fits the collected data. Several AR systems have used camera calibration techniques, including [ARGOS94] [Bajura93] [Drascic91] [Tuceryan95] [Whitaker95].
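Each 2-D/3-D correspondence in such a calibration sets up a constraint of the standard pinhole form (this is the usual computer-vision notation, not notation taken from the cited papers): an image point $(u, v)$ matched to a known 3-D point $(X, Y, Z)$ must satisfy

$$s\begin{bmatrix}u\\ v\\ 1\end{bmatrix} = K\,[R \mid t]\begin{bmatrix}X\\ Y\\ Z\\ 1\end{bmatrix}, \qquad K=\begin{bmatrix}f_x & 0 & c_x\\ 0 & f_y & c_y\\ 0 & 0 & 1\end{bmatrix},$$

so each matched point contributes two equations; with enough points seen from different locations, the intrinsic parameters $K$ and the poses $[R \mid t]$ are over-determined and can be recovered by least-squares optimization.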
4.3 Dynamic errors
Dynamic errors occur because of system delays, or lags. The end-to-end system delay is defined as the time difference between the moment that the tracking system measures the position and orientation of the viewpoint to the moment when the generated images corresponding to that position and orientation appear in the displays. These delays exist because each component in an Augmented Reality system requires some time to do its job. The delays in the tracking subsystem, the communication delays, the time it takes the scene generator to draw the appropriate images in the frame buffers, and the scanout time from the frame buffer to the displays all contribute to end-to-end lag. End-to-end delays of 100 ms are fairly typical on existing systems. Simpler systems can have less delay, but other systems have more. Delays of 250 ms or more can exist on slow, heavily loaded, or networked systems.
End-to-end system delays cause registration errors only when motion occurs. Assume that the viewpoint and all objects remain still. Then the lag does not cause registration errors. No matter how long the delay is, the images generated are appropriate, since nothing has moved since the time the tracker measurement was taken. Compare this to the case with motion. For example, assume a user wears a see-through HMD and moves her head. The tracker measures the head at an initial time t. The images corresponding to time t will not appear until some future time t2, because of the end-to-end system delays. During this delay, the user's head remains in motion, so when the images computed at time t finally appear, the user sees them at a different location than the one they were computed for. Thus, the images are
incorrect for the time they are actually viewed. To the user, the virtual objects appear to "swim around" and "lag behind" the real objects. This was graphically demonstrated in a videotape of UNC's ultrasound experiment shown at SIGGRAPH '92 [Bajura92]. In Figure 17, the picture on the left shows what the registration looks like when everything stands still. The virtual gray trapezoidal region represents what the ultrasound wand is scanning. This virtual trapezoid should be attached to the tip of the real ultrasound wand. This is the case in the picture on the left, where the tip of the wand is visible at the bottom of the picture, to the left of the "UNC" letters. But when the head or the wand moves, large dynamic registration errors occur, as shown in the picture on the right. The tip of the wand is now far away from the virtual trapezoid. Also note the motion blur in the background, which is caused by the user's head motion.
Figure 17: Effect of motion and system delays on registration. Picture on the left is a static scene. Picture on the right shows motion. (Courtesy UNC Chapel
Hill Dept. of Computer Science)
System delays seriously hurt the illusion that the real and virtual worlds coexist because they cause large registration errors. With a typical end-to-end lag of 100 ms and a moderate head rotation rate of 50 degrees per second, the angular dynamic error is 5 degrees. At a 68 cm arm length, this results in registration errors of almost 60 mm. System delay is the largest single source of registration error in existing AR systems, outweighing all others combined [Holloway95].
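The numbers above follow from simple kinematics, using the small-angle approximation for the displacement at arm's length:

$$\Delta\theta = \omega\,\Delta t = 50^\circ/\mathrm{s}\times 0.1\,\mathrm{s} = 5^\circ, \qquad \Delta x \approx r\,\Delta\theta_{\mathrm{rad}} = 0.68\,\mathrm{m}\times\frac{5\pi}{180}\approx 0.059\,\mathrm{m}\approx 60\,\mathrm{mm}.$$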
Methods used to reduce dynamic registration fall under four main categories:
• Reduce system lag
• Reduce apparent lag
• Match temporal streams (with video-based systems)
• Predict future locations
1) Reduce system lag: The most direct approach is simply to reduce, or ideally eliminate, the system delays. If there are no delays, there are no dynamic errors. Unfortunately, modern scene generators are usually built for throughput, not minimal latency [Foley90]. It is sometimes possible to reconfigure the software to sacrifice throughput to minimize latency. For example, the SLATS system completes rendering a pair of interlaced NTSC images in one field time (16.67 ms) on PixelPlanes 5 [Olano95]. Being careful about synchronizing pipeline tasks can also reduce the end-to-end lag [Wloka95a].
System delays are not likely to completely disappear anytime soon. Some believe that the current course of technological development will automatically solve this problem. Unfortunately, it is difficult to reduce system delays to the point where they are no longer an issue. Recall that registration errors must be kept to a small fraction of a degree. At the moderate head rotation rate of 50 degrees per second, system lag must be 10 ms or less to keep angular errors below 0.5 degrees. Just scanning out a frame buffer to a display at 60 Hz requires 16.67 ms. It might be possible to build an HMD system with less than 10 ms of lag, but the drastic cut in throughput and the expense required to construct the system would make alternate solutions attractive. Minimizing system delay is important, but reducing delay to the point where it is no longer a source of registration error is not currently practical.
2) Reduce apparent lag: Image deflection is a clever technique for reducing the amount of apparent system delay for systems that only use head orientation [Burbidge89] [Regan94] [Riner92] [So92]. It is a way to incorporate more recent orientation measurements into the late stages of the rendering pipeline. Therefore, it is a feed-forward technique. The scene generator renders an image much larger than needed to fill the display. Then just before scanout, the system reads the most recent orientation report. The orientation value is used to select the fraction of the frame buffer to send to the display, since small orientation changes are equivalent to shifting the frame buffer output horizontally and vertically.
Image deflection does not work on translation, but image warping techniques might [Chen93] [McMillan95a] [McMillan95b]. After the scene generator renders the image based upon the head tracker reading, small adjustments in orientation and translation could be done after rendering by warping the image. These techniques assume knowledge of the depth at every pixel, and the warp must be done much more quickly than rerendering the entire image.
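Returning to image deflection for the orientation-only case: the sketch below illustrates converting a late orientation reading into a frame-buffer shift. It assumes a display with known fields of view and a numpy-style oversized frame buffer; the function names and parameters are illustrative, not taken from any of the cited systems.

```python
import numpy as np

def deflection_offset(yaw_err_deg, pitch_err_deg, fov_h_deg, fov_v_deg,
                      width_px, height_px):
    """Convert a small late orientation change into a frame-buffer shift.

    For small angles, rotating the view by one degree is roughly equivalent
    to shifting the displayed window by (pixels per degree) pixels.
    """
    dx = int(round(yaw_err_deg * width_px / fov_h_deg))
    dy = int(round(pitch_err_deg * height_px / fov_v_deg))
    return dx, dy

def select_window(oversized_frame, dx, dy, width_px, height_px, margin_px):
    """Pick the displayed sub-window from an image rendered larger than the display."""
    x0 = int(np.clip(margin_px + dx, 0, oversized_frame.shape[1] - width_px))
    y0 = int(np.clip(margin_px + dy, 0, oversized_frame.shape[0] - height_px))
    return oversized_frame[y0:y0 + height_px, x0:x0 + width_px]
```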
3) Match temporal streams: In video-based AR systems, the video camera and digitization hardware impose inherent delays on the user's view of the real world. This is potentially a blessing when reducing dynamic errors, because it allows the temporal streams of the real and virtual images to be matched. Additional delay is added to the video from the real world to match the scene generator delays in generating the virtual images. This additional delay to the video stream will probably not remain constant, since the scene generator delay will vary with the complexity of the rendered scene. Therefore, the system must dynamically synchronize the two streams.
Note that while this reduces conflicts between the real and virtual, now both the real and virtual objects are delayed in time. While this may not be bothersome for small delays, it is a major problem in the related area of telepresence systems and will not be easy to overcome. For long delays, this can produce negative effects such as pilot-induced oscillation.
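One way to picture the dynamic synchronization of the two temporal streams is as a small buffer of timestamped video frames, from which the frame whose age best matches the current scene-generator latency is selected for compositing. This is only an illustrative sketch; the buffer size and the latency estimate are assumptions, not details from any cited system.

```python
from collections import deque

class VideoDelayLine:
    """Delays real-world video frames to match the scene generator's latency."""

    def __init__(self, max_frames=30):
        self.frames = deque(maxlen=max_frames)    # (capture_time, frame) pairs

    def push(self, capture_time, frame):
        self.frames.append((capture_time, frame))

    def matched_frame(self, now, render_latency):
        """Return the buffered frame whose age best matches the current latency."""
        target_time = now - render_latency
        capture_time, frame = min(self.frames,
                                  key=lambda tf: abs(tf[0] - target_time))
        return frame
```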
4) Predict: The last method is to predict the future viewpoint and object locations. If the future locations are known, the scene can be rendered with these future locations, rather than the measured locations. Then when the scene finally appears, the viewpoints and objects have moved to the predicted locations, and the graphic images are correct at the time they are viewed. For short system delays
(under ~80 ms), prediction has been shown to reduce dynamic errors by up to an order of magnitude [Azuma94]. Accurate predictions require a system built for realtime measurements and computation. Using inertial sensors makes predictions more accurate by a factor of 2-3. Predictors have been developed for a few AR systems [Emura94] [Zikan94b], but the majority were implemented and evaluated with VE systems (see the reference list in [Azuma94]). More work needs to be done on ways of comparing the theoretical performance of various predictors [Azuma95a] [Azuma95b] and in developing prediction models that better match actual head motion [Wu95].
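As a toy illustration of prediction, the sketch below extrapolates head orientation forward by the expected delay using the angular velocity reported by an inertial sensor. Real predictors such as those surveyed in [Azuma94] use considerably more sophisticated filtering; this constant-angular-velocity version only conveys the basic idea.

```python
import numpy as np

def predict_orientation(quat, gyro_rad_s, delay_s):
    """Constant-angular-velocity extrapolation of a head orientation.

    quat       : current orientation as (w, x, y, z), unit quaternion
    gyro_rad_s : angular velocity (wx, wy, wz) from an inertial sensor, rad/s
    delay_s    : expected end-to-end system delay in seconds
    """
    w, x, y, z = quat
    wx, wy, wz = gyro_rad_s
    # Quaternion derivative: q_dot = 0.5 * q * (0, wx, wy, wz)
    q_dot = 0.5 * np.array([
        -x * wx - y * wy - z * wz,
         w * wx + y * wz - z * wy,
         w * wy - x * wz + z * wx,
         w * wz + x * wy - y * wx,
    ])
    predicted = np.array(quat) + delay_s * q_dot
    return predicted / np.linalg.norm(predicted)   # renormalize to a unit quaternion
```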
4.4 Vision-based techniques
Mike Bajura and Ulrich Neumann [Bajura95] point out that registration based solely on the information from the tracking system is like building an "open-loop" controller. The system has no feedback on how closely the real and virtual actually match. Without feedback, it is difficult to build a system that achieves perfect matches. However, video-based approaches can use image processing or computer vision techniques to aid registration. Since video-based AR systems have a digitized image of the real environment, it may be possible to detect features in the environment and use those to enforce registration. They call this a "closed-loop" approach, since the digitized image provides a mechanism for bringing feedback into the system.
This is not a trivial task. This detection and matching must run in real time and must be robust. This often requires special hardware and sensors. However, it is also not an "AI-complete" problem because this is simpler than the general computer vision problem.
For example, in some AR applications it is acceptable to place fiducials in the environment. These fiducials may be LEDs [Bajura95] or special markers [Mellor95a] [Mellor95b] [Neumann96]. Recent ultrasound experiments at UNC Chapel Hill have used colored dots as fiducials [State96a]. The locations or patterns of the fiducials are assumed to be known. Image processing detects the locations of the fiducials, then those are used to make corrections that enforce proper registration.
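As a hedged sketch of how detected fiducials can enforce registration with present-day tooling (OpenCV's solvePnP, which postdates the systems cited here): given the known 3-D fiducial locations and their detected 2-D image positions, the camera pose can be recovered and used to correct, or replace, the head-tracker reading. The fiducial coordinates and camera parameters below are placeholders, not values from the cited work.

```python
import cv2
import numpy as np

# Known 3-D fiducial locations in the object's coordinate frame (placeholder values, metres)
fiducials_3d = np.array([[0.0, 0.0, 0.0],
                         [0.1, 0.0, 0.0],
                         [0.1, 0.1, 0.0],
                         [0.0, 0.1, 0.0]], dtype=np.float64)

def pose_from_fiducials(detected_2d, camera_matrix, dist_coeffs):
    """Estimate the camera pose from detected fiducial image locations.

    detected_2d : Nx2 pixel coordinates of the fiducials, in the same order
                  as fiducials_3d (the detection step itself is not shown).
    Returns the rotation and translation that register virtual objects
    defined in the object's frame with the live video.
    """
    ok, rvec, tvec = cv2.solvePnP(fiducials_3d, detected_2d.astype(np.float64),
                                  camera_matrix, dist_coeffs)
    if not ok:
        raise RuntimeError("pose estimation failed")
    return rvec, tvec
```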
These routines assume that one or more fiducials are visible at all times; without them, the registration can fall apart. But when the fiducials are visible, the results can be accurate to one pixel, which is about as close as one can get with video techniques. Figure 18, taken from [Bajura95], shows a virtual arrow and a virtual chimney exactly aligned with their desired points on two real objects. The real objects each have an LED to aid the registration. Figures 19 through 21 show registration from [Mellor95a], which uses dots with a circular pattern as the fiducials. The registration is also nearly perfect. Figure 22 demonstrates merging virtual objects with the real environment, using colored dots as the fiducials in a video-based approach. In the picture on the left, the stack of cards in the center is real, but the ones on the right are virtual. Notice that they penetrate one of the blocks. In the image on the right, a virtual spiral object interpenetrates the real blocks and table and also casts virtual shadows upon the real objects [State96a].
Figure 18: A virtual arrow and virtual chimney aligned with two real objects. (Courtesy Mike Bajura, UNC Chapel Hill Dept. of Computer Science, and Ulrich Neumann, USC)
Figure 19: Real skull with five fiducials. (Courtesy J.P. Mellor, MIT AI Lab)
Figure 20: Virtual wireframe skull registered with real skull. (Courtesy J.P. Mellor, MIT AI Lab)
Figure 21: Virtual wireframe skull registered with real skull moved to a different position. (Courtesy J.P. Mellor, MIT AI Lab)
Figure 22: Virtual cards and spiral object merged with real blocks and table. (Courtesy Andrei State, UNC Chapel Hill Dept. of Computer Science.)
Instead of fiducials, [Uenohara95] uses template matching to achieve registration. Template images of the real object are taken from a variety of viewpoints. These are used to search the digitized image for the real object. Once that is found, a virtual wireframe can be superimposed on the real object.
Recent approaches in video-based matching avoid the need for any calibration. [Kutulakos96] represents virtual objects in a non-Euclidean, affine frame of reference that allows rendering without knowledge of camera parameters. [Iu96] extracts contours from the video of the real world, then uses an optimization technique to match the contours of the rendered 3-D virtual object with the contour extracted from the video. Note that calibration-free approaches may not recover all the information required to perform all potential AR tasks. For example, these two approaches do not recover true depth information, which is useful when compositing the real and the virtual.
Techniques that use fiducials as the sole tracking source determine the relative projective relationship between the objects in the environment and the video camera. While this is enough to ensure registration, it does not provide all the information one might need in some AR applications, such as the absolute (rather than relative) locations of the objects and the camera. Absolute locations are needed to include virtual and real objects that are not tracked by the video camera, such as a 3-D pointer or other virtual objects not directly tied to real objects in the scene.
Additional sensors besides video cameras can aid registration. Both [Mellor95a] [Mellor95b] and [Grimson94] [Grimson95] use a laser rangefinder to acquire an initial depth map of the real object in the environment. Given a matching virtual model, the system can match the depth maps from the real and virtual until they are properly aligned, and that provides the information needed for registration.
Another way to reduce the difficulty of the problem is to accept the fact that the system may not be robust and may not be able to perform all tasks automatically. Then it can ask the user to perform certain tasks. The system in [Sharma94] expects manual intervention when the vision algorithms fail to identify a part because the view is obscured. The calibration techniques in [Tuceryan95] are heavily based on computer vision techniques, but they ask the user to manually intervene by specifying correspondences when necessary.
4.5 Current status
The registration requirements for AR are difficult to satisfy, but a few systems have achieved good results. [Azuma94] is an open-loop system that shows registration typically within ±5 millimeters from many viewpoints for an object at about arm's length. Closed-loop systems, however, have demonstrated nearly perfect registration, accurate to within a pixel [Bajura95] [Mellor95a] [Mellor95b] [Neumann96] [State96a].
The registration problem is far from solved. Many systems assume a static viewpoint, static objects, or even both. Even if the viewpoint or objects are allowed to move, they are often restricted in how far they can travel. Registration is shown under controlled circumstances, often with only a small number of real-world objects, or where the objects are already well-known to the system. For example, registration may only work on one object marked with fiducials, and not on any other objects in the scene. Much more work needs to be done to increase the domains in which registration is robust. Duplicating registration methods remains a nontrivial task, due to both the complexity of the methods and the additional hardware required. If simple yet effective solutions could be developed, that would speed the acceptance of AR systems.
5. Sensing
Accurate registration and positioning of virtual objects in the real environment requires accurate tracking of the user's head and sensing the locations of other objects in the environment. The biggest single obstacle to building effective Augmented Reality systems is the requirement of accurate, long-range sensors and trackers that report the locations of the user and the surrounding objects in the environment. For details of tracking technologies, see the surveys in [Ferrin91] [Meyer92] and Chapter 5 of [Durlach95]. Commercial trackers are aimed at the needs of Virtual Environments and motion capture applications. Compared to those two applications, Augmented Reality has much stricter accuracy requirements and demands larger working volumes. No tracker currently provides high accuracy at long ranges in real time. More work needs to be done to develop sensors and trackers that can meet these stringent requirements.
Specifically, AR demands more from trackers and sensors in three areas:
• Greater input variety and bandwidth
• Higher accuracy
• Longer range
5.1 Input variety and bandwidth
VE systems are primarily built to handle output bandwidth: the images displayed, sounds generated, etc. The input bandwidth is tiny: the locations of the user's head and hands, the outputs from the buttons and other control devices, etc. AR systems, however, will need a greater variety of input sensors and much more input bandwidth [Buxton93]. There are a greater variety of possible input sensors than output displays. Outputs are limited to the five human senses. Inputs can come from anything a sensor can detect. Robinett speculates that Augmented Reality may be useful in any application that requires displaying information not directly available or detectable by human senses by making that information visible (or audible, touchable, etc.) [Robinett92a]. Recall that the proposed medical applications in Section 2.1 use CT, MRI and ultrasound sensors as inputs. Other future applications might use sensors to extend the user's visual range into infrared or ultraviolet frequencies, and remote sensors would let users view objects hidden by walls or hills. Conceptually, anything not detectable by human senses but detectable by machines might be transduced into something that a user can sense in an AR system.
Range data is a particular input that is vital for many AR applications [Aliaga97] [Breen96]. The AR system knows the distance to the virtual objects, because that model is built into the system. But the AR system may not know where all the real objects are in the environment. The system might assume that the entire environment is measured at the beginning and remains static thereafter. However, some useful applications will require a dynamic environment, in which real objects move, so the objects must be tracked in real time. For some applications, though, a depth map of the real environment would be sufficient. That would allow real objects to occlude virtual objects through a pixel-by-pixel depth value comparison. Acquiring this depth map in real time is not trivial. Sensors like laser rangefinders might be used. Many computer vision techniques for recovering shape through various strategies (e.g., "shape from stereo" or "shape from shading") have been tried. A recent work [Wloka95b] uses intensity-based matching from a pair of stereo images to do depth recovery. Recovering depth through existing vision techniques is difficult to do robustly in real time.
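The pixel-by-pixel comparison mentioned above amounts to a z-buffer test between the sensed depth of the real scene and the rendered depth of the virtual scene. A minimal sketch, assuming both depth maps are already available at the same resolution and in the same units:

```python
import numpy as np

def composite_with_occlusion(real_rgb, real_depth, virtual_rgb, virtual_depth):
    """Overlay virtual imagery, letting nearer real surfaces occlude it.

    real_depth / virtual_depth : per-pixel distances from the viewpoint;
    virtual pixels with no geometry are assumed to hold +inf.
    """
    virtual_wins = virtual_depth < real_depth      # virtual surface is closer
    out = real_rgb.copy()
    out[virtual_wins] = virtual_rgb[virtual_wins]  # keep real pixels elsewhere
    return out
```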
Finally, some annotation applications require access to a detailed database of the environment, which is a type of input to the system. For example, the architectural application of "seeing into the walls" assumes that the system has a database of where all the pipes, wires and other hidden objects are within the building. Such a database may not be readily available, and even if it is, it may not be in a format that is easily usable. For example, the data may not be grouped to segregate the parts of the model that represent wires from the parts that represent pipes. Thus, a significant modelling effort may be required and should be taken into consideration when building an AR application.
5.2 High accuracy
The accuracy requirements for the trackers and sensors are driven by the accuracies needed for visual registration, as described in Section 4. For many
approaches, the registration is only as accurate as the tracker. Therefore, the AR system needs trackers that are accurate to around a millimeter and a tiny fraction of a degree, across the entire working range of the tracker.
Few trackers can meet this specification, and every technology has weaknesses. Some mechanical trackers are accurate enough, although they tether the user to a limited working volume. Magnetic trackers are vulnerable to distortion by metal in the environment, which exists in many desired AR application environments. Ultrasonic trackers suffer from noise and are difficult to make accurate at long ranges because of variations in the ambient temperature. Optical technologies [Janin94] have distortion and calibration problems. Inertial trackers drift with time. Of the individual technologies, optical technologies show the most promise due to trends toward high-resolution digital cameras, real-time photogrammetric techniques, and structured light sources that result in more signal strength at long distances. Future tracking systems that can meet the stringent requirements of AR will probably be hybrid systems [Azuma93] [Durlach95] [Foxlin96] [Zikan94b], such as a combination of inertial and optical technologies. Using multiple technologies opens the possibility of covering for each technology's weaknesses by combining their strengths.
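As a caricature of why hybrids help, the sketch below blends a fast but drifting inertial estimate with a slower, absolute optical fix using a simple complementary filter. Actual hybrid trackers such as [Foxlin96] and [Zikan94a] use Kalman-style filters, so this is only meant to convey the complementary-strengths idea; the gain value is an assumption.

```python
def complementary_update(inertial_estimate, optical_fix, gain=0.02):
    """Blend a drifting high-rate estimate with an absolute low-rate fix.

    inertial_estimate : pose component integrated from inertial sensors
                        (updates at hundreds of Hz, but drifts over time)
    optical_fix       : absolute measurement from an optical tracker
                        (updates more slowly, but does not drift)
    gain              : how aggressively the drift is pulled toward the fix
    """
    if optical_fix is None:            # no optical measurement this cycle
        return inertial_estimate
    return (1.0 - gain) * inertial_estimate + gain * optical_fix
```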
Attempts have been made to calibrate the distortions in commonly-used magnetic tracking systems [Bryson92] [Ghazisaedy95]. These have succeeded at removing much of the gross error from the tracker at long ranges, but not to the level required by AR systems [Holloway95]. For example, mean errors at long ranges can be reduced from several inches to around one inch.
The requirements for registering other sensor modes are not nearly as stringent. For example, the human auditory system is not very good at localizing deep bass sounds, which is why subwoofer placement is not critical in a home theater system.
5.3 Long range
Few trackers are built for accuracy at long ranges, since most VE applications do not require long ranges. Motion capture applications track an actor's body parts to control a computer-animated character or for the analysis of an actor's movements. This is fine for position recovery, but not for orientation. Orientation recovery is based upon the computed positions. Even tiny errors in those positions can cause orientation errors of a few degrees, which is too large for AR systems.
Two scalable tracking systems for HMDs have been described in the literature [Ward92] [Sowizral93]. A scalable system is one that can be expanded to cover any desired range, simply by adding more modular components to the system. This is done by building a cellular tracking system, where only nearby sources and sensors are used to track a user. As the user walks around, the set of sources and sensors changes, thus achieving large working volumes while avoiding long distances between the current working set of sources and sensors. While scalable trackers can
be effective, they are complex and by their very nature have many components, making them relatively expensive to construct.
The Global Positioning System (GPS) is used to track the locations of vehicles almost anywhere on the planet. It might be useful as one part of a long range tracker for AR systems. However, by itself it will not be sufficient. The best reported accuracy, achieved when GPS is run in differential mode, is approximately one centimeter, and that assumes many measurements are integrated, so the accuracy is not available in real time. That is not sufficiently accurate to recover orientation from a set of positions on a user.
Tracking an AR system outdoors in real time with the required accuracy has not been demonstrated and remains an open problem.
6. Future directions
This section identifies areas and approaches that require further research to produce improved AR systems.
Hybrid approaches: Future tracking systems may be hybrids, because combining approaches can cover weaknesses. The same may be true for other problems in AR. For example, current registration strategies generally focus on a single strategy. Future systems may be more robust if several techniques are combined. An example is combining vision-based techniques with prediction. If the fiducials are not available, the system switches to open-loop prediction to reduce the registration errors, rather than breaking down completely. The predicted viewpoints in turn produce a more accurate initial location estimate for the vision-based techniques.
Real-time systems and time-critical computing: Many VE systems are not truly run in real time. Instead, it is common to build the system, often on UNIX, and then see how fast it runs. This may be sufficient for some VE applications. Since everything is virtual, all the objects are automatically synchronized with each other. AR is a different story. Now the virtual and real must be synchronized, and the real world "runs" in real time. Therefore, effective AR systems must be built with realtime performance in mind. Accurate timestamps must be available. Operating systems must not arbitrarily swap out the AR software process at any time, for arbitrary durations. Systems must be built to guarantee completion within specified time budgets, rather than just "running as quickly as possible." These are characteristics of flight simulators and a few VE systems [Krueger92]. Constructing and debugging real-time systems is often painful and difficult, but the requirements for AR demand real-time performance.
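One concrete consequence of these requirements is a main loop organized around explicit timestamps and a fixed time budget rather than "as fast as possible." The sketch below is purely schematic; the tracker, renderer, and display objects and their methods are placeholders for whatever a particular system provides.

```python
import time

FRAME_BUDGET_S = 1.0 / 60.0            # must finish within one display field

def run_frame(tracker, renderer, display):
    """One iteration of a time-budgeted AR frame loop (schematic only)."""
    start = time.monotonic()
    pose, pose_time = tracker.read()   # timestamp the measurement itself
    image = renderer.draw(pose)
    display.present(image, pose_time)  # keep the timestamp with the frame
    elapsed = time.monotonic() - start
    if elapsed > FRAME_BUDGET_S:       # missed the budget:
        renderer.reduce_detail()       # trade fidelity for guaranteed completion
```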
Perceptual and psychophysical studies: Augmented Reality is an area ripe for psychophysical studies. How much lag can a user detect? How much registration error is detectable when the head is moving? Besides questions on perception, psychological experiments that explore performance issues are also needed. How
much does head-motion prediction improve user performance on a specific task? How much registration error is tolerable for a specific application before performance on that task degrades substantially? Is the allowable error larger while the user moves her head versus when she stands still? Furthermore, not much is known about potential optical illusions caused by errors or conflicts in the simultaneous display of real and virtual objects [Durlach95].
Few experiments in this area have been performed. Jannick Rolland, Frank Biocca and their students conducted a study of the effect caused by eye displacements in video see-through HMDs [Rolland95]. They found that users partially adapted to the eye displacement, but they also had negative aftereffects after removing the HMD. Steve Ellis' group at NASA Ames has conducted work on perceived depth in a see-through HMD [Ellis94] [Ellis95]. ATR has also conducted a study [Utsumi94].
Portability: Section 3.4 explained why some potential AR applications require giving the user the ability to walk around large environments, even outdoors. This requires making the equipment self-contained and portable. Existing tracking technology is not capable of tracking a user outdoors at the required accuracy.
Multimodal displays: Almost all work in AR has focused on the visual sense: virtual graphic objects and overlays. But Section 3.1 explained that augmentation might apply to all other senses as well. In particular, adding and removing 3-D sound is a capability that could be useful in some AR applications.
Social and political issues: Technological issues are not the only ones that need to be considered when building a real application. There are also social and political dimensions when getting new technologies into the hands of real users. Sometimes, perception is what counts, even if the technological reality is different. For example, if workers perceive lasers to be a health risk, they may refuse to use a system with lasers in the display or in the trackers, even if those lasers are eye safe. Ergonomics and ease of use are paramount considerations. Whether AR is truly a cost-effective solution in its proposed applications has yet to be determined. Another important factor is whether or not the technology is perceived as a threat to jobs, as a replacement for workers, especially with many corporations undergoing recent layoffs. AR may do well in this regard, because it is intended as a tool to make the user's job easier, rather than something that completely replaces the human worker. Although technology transfer is not normally a subject of academic papers, it is a real problem. Social and political concerns should not be ignored during attempts to move AR out of the research lab and into the hands of real users.
7. Conclusion
Augmented Reality is far behind Virtual Environments in maturity. Several commercial vendors sell complete, turnkey Virtual Environment systems. However, no commercial vendor currently sells an HMD-based Augmented Reality system. A few monitor-based "virtual set" systems are available, but today AR systems are primarily found in academic and industrial research laboratories.
The first deployed HMD-based AR systems will probably be in the application of aircraft manufacturing. Both Boeing [ARPA95] [BoeingTRP94] and McDonnell Douglas [Neumann96] are exploring this technology. The former uses optical approaches, while the latter is pursuing video approaches. Boeing has performed trial runs with workers using a prototype system but has not yet made any deployment decisions. Annotation and visualization applications in restricted, limited-range environments are deployable today, although much more work needs to be done to make them cost effective and flexible. Applications in medical visualization will take longer. Prototype visualization aids have been used on an experimental basis, but the stringent registration requirements and ramifications of mistakes will postpone common usage for many years. AR will probably be used for medical training before it is commonly used in surgery.
The next generation of combat aircraft will have Helmet-Mounted Sights with graphics registered to targets in the environment [Wanstall89]. These displays, combined with short-range steerable missiles that can shoot at targets off-boresight, give a tremendous combat advantage to pilots in dogfights. Instead of having to be directly behind his target in order to shoot at it, a pilot can now shoot at anything within a 60-90 degree cone of his aircraft's forward centerline. Russia and Israel currently have systems with this capability, and the U.S. is expected to field the AIM9X missile with its associated Helmet-Mounted Sight in 2002 [Dornheim95a] [Dornheim95b]. Registration errors due to delays are a major problem in this application [Dornheim95c].
Augmented Reality is a relatively new field, where most of the research efforts have occurred in the past four years, as shown by the references listed at the end of this paper. The SIGGRAPH "Rediscovering Our Fire" report identified Augmented Reality as one of four areas where SIGGRAPH should encourage more submissions [Mair94]. Because of the numerous challenges and unexplored avenues in this area, AR will remain a vibrant area of research for at least the next several years.
One area where a breakthrough is required is tracking an HMD outdoors at the accuracy required by AR. If this is accomplished, several interesting applications will become possible. Two examples are described here: navigation maps and visualization of past and future environments.
The first application is a navigation aid to people walking outdoors. These individuals could be soldiers advancing upon their objective, hikers lost in the woods, or tourists seeking directions to their intended destination. Today, these individuals must pull out a physical map and associate what they see in the real environment around them with the markings on the 2D map. If landmarks are not easily identifiable, this association can be difficult to perform, as anyone lost in the woods can attest. An AR system makes navigation easier by performing the association step automatically. If the user's position and orientation are known, and the AR system has access to a digital map of the area, then the AR system can draw the map in 3-D directly upon the user's view. The user looks at a nearby mountain and sees graphics directly overlaid on the real environment explaining the mountain's name, how tall it is, how far away it is, and where the trail is that leads to the top.
The second application is visualization of locations and events as they were in the past or as they will be after future changes are performed. Tourists that visit historical sites, such as a Civil War battlefield or the Acropolis in Athens, Greece, do not see these locations as they were in the past, due to changes over time. It is often difficult for a modern visitor to imagine what these sites really looked like in the past. To help, some historical sites stage "Living History" events where volunteers wear ancient clothes and reenact historical events. A tourist equipped with an outdoors AR system could see a computer-generated version of Living History. The HMD could cover up modern buildings and monuments in the background and show, directly on the grounds at Gettysburg, where the Union and Confederate troops were at the fateful moment of Pickett's charge. The gutted interior of the modern Parthenon would be filled in by computer-generated representations of what it looked like in 430 BC, including the long-vanished gold statue of Athena in the middle. Tourists and students walking around the grounds with such AR displays would gain a much better understanding of these historical sites and the important events that took place there. Similarly, AR displays could show what proposed architectural changes would look like before they are carried out. An urban designer could show clients and politicians what a new stadium would look like as they walked around the adjoining neighborhood, to better understand how the stadium project will affect nearby residents.
After the basic problems with AR are solved, the ultimate goal will be to generate virtual objects that are so realistic that they are virtually indistinguishable from the real environment [Fournier94]. Photorealism has been demonstrated in feature films, but accomplishing this in an interactive application will be much harder. Lighting conditions, surface reflections, and other properties must be measured automatically, in real time. More sophisticated lighting, texturing, and shading capabilities must run at interactive rates in future scene generators. Registration must be nearly perfect, without manual intervention or adjustments. While these are difficult problems, they are probably not insurmountable. It took about 25 years to progress from drawing stick figures on a screen to the photorealistic dinosaurs in "Jurassic Park." Within another 25 years, we should be able to wear a pair of AR glasses outdoors to see and interact with photorealistic dinosaurs eating a tree in our backyard.
Acknowledgements
This paper is an updated version of my course notes for an ACM SIGGRAPH '95 class called "Developing Advanced Virtual Reality Applications," given in Los Angeles, CA, in August 1995.
I thank the anonymous reviewers for their constructive criticism and suggestions for improving this paper. I also thank the following individuals and organizations for sending pictures to include with this paper:
• Mike Bajura, Andrei State, and Linda Houseman, University of North Carolina at Chapel Hill Department of Computer Science
• David Drascic and Paul Milgram, University of Toronto
• Steve Feiner and Blair MacIntyre, Columbia University
• Alessandro Giacalone, The European Computer-Industry Research Centre (ECRC) and David Breen, Caltech
• J.P. Mellor, MIT AI Laboratory
• David Mizell, Boeing
• Ulrich Neumann, University of Southern California
• Jannick Rolland, Center for Research and Engineering in Optics and Lasers (CREOL) at the University of Central Florida (rolland@creol.ucf.edu)
References and Bibliography
Note: some of these references are available electronically at the following sites on the World Wide Web:
Columbia U.: http://www.cs.columbia.edu/graphics/
ECRC: http://www.ecrc.de/
MIT AI Lab: http://www.ai.mit.edu/
UNC Chapel Hill: http://www.cs.unc.edu/
U. Toronto: http://vered.rose.utoronto.ca/etc-lab.html/
Ahlers95
Ahlers, Klaus H., André Kramer, David E. Breen, Pierre-Yves Chevalier, Chris Crampton, Eric Rose, Mihran Tuceryan, Ross T. Whitaker and Douglas Greer. Distributed Augmented Reality for Collaborative Design Applications. Proceedings of Eurographics '95 (Maastricht, The Netherlands, August 1995), 3-14.
Aliaga97
Aliaga, Daniel G. Virtual Objects in the Real World. Communications of the ACM 40, 3 (March 1997), 49-54.
ARGOS94
ARGOS Virtual Pointer Camera Calibration Procedure. WWW page = http://vered.rose.utoronto.ca/people/david_dir/POINTER/ Calibration.html
ARPA95
WWW page = http://molothrus.sysplan.com/ESTO/
Azuma93
Azuma, Ronald. Tracking Requirements for Augmented Reality. Communications of the ACM 36, 7 (July 1993), 50-51.
Azuma94
Azuma, Ronald, and Gary Bishop. Improving Static and Dynamic Registration in a See-Through HMD. Proceedings of SIGGRAPH 94 (Orlando, FL, 24-29 July 1994). In Computer Graphics, Annual Conference Series, 1994, 197-204.
Azuma95a
Azuma, Ronald T. Predictive Tracking for Augmented Reality. Ph.D. dissertation. UNC Chapel Hill Department of Computer Science technical report TR95-007 (February 1995).
Azuma95b
Azuma, Ronald, and Gary Bishop. A Frequency-Domain Analysis of Head-Motion Prediction. Proceedings of SIGGRAPH 95 (Los Angeles, CA, 6-11 August 1995). In Computer Graphics, Annual Conference Series, 1995, 401-408.
Barfield95
Barfield, Woodrow, Craig Rosenberg, and Wouter A. Lotens. Augmented-Reality Displays. In Barfield, Woodrow and Thomas A. Furness III (editors). Virtual Environments and Advanced Interface Design. Oxford University Press (1995), 542-575. ISBN 0-19-507555-2.
Bajura92
Bajura, Mike, Henry Fuchs, and Ryutarou Ohbuchi. Merging Virtual Reality with the Real World: Seeing Ultrasound Imagery Within the Patient. Proceedings of SIGGRAPH 92 (Chicago, IL, 26-31 July 1992). In Computer Graphics 26, 2 (July 1992), 203-210.
Bajura93
Bajura, Mike. Camera Calibration for Video See-Through Head-Mounted Display. UNC Chapel Hill Department of Computer Science technical report TR93-048 (July 7, 1993), 6 pages.
Bajura95
Bajura, Michael and Ulrich Neumann. Dynamic Registration Correction in Video-Based Augmented Reality Systems. IEEE Computer Graphics and Applications 15, 5 (September 1995), 52-60.
Betting95
Betting, Fabienne, Jacques Feldmar, Nicholas Ayache, and Frédéric Devernay. A New Framework for Fusing Stereo Images with Volumetric Medical Images. Proceedings of Computer Vision, Virtual Reality, and Robotics in Medicine '95 (CVRMed '95) (Nice, France, 3-6 April 1995), 30-39.
BoeingTRP94
WWW page = http://esto.sysplan.com/ESTO/Displays/HMDTDS/Factsheets/Boeing.html (July 1994).
Bowskill95
Bowskill, Jerry and John Downie. Extending the Capabilities of the Human Visual System: An Introduction to Enhanced Reality. Computer Graphics 29, 2 (May 1995), 61-65.
Breen96
Breen, David E., Ross T. Whitaker, Eric Rose and Mihran Tuceryan. Interactive Occlusion and Automatic Object Placement for Augmented Reality. Proceedings of Eurographics '96 (Futuroscope - Poitiers, France, 26-30 August 1996), 11-22.
Brooks96
Brooks, Frederick P. Jr. The Computer Scientist as Toolsmith II. CACM 39, 3 (March 1996), 61-68.
Bryson92
Bryson, Steve. Measurement and Calibration of Static Distortion of Position Data from 3D Trackers. Proceedings of SPIE Vol. 1669: Stereoscopic Displays and Applications III (San Jose, CA, 12-13 February 1992), 244-255.
Burbidge89
Burbidge, Dick, and Paul M. Murray. Hardware Improvements to the Helmet-Mounted Projector on the Visual Display Research Tool (VDRT) at the Naval Training Systems Center. SPIE Proceedings Vol. 1116 Head-Mounted Displays (1989), 52-59.
Buxton93
Buxton, Bill. Personal communication. MIT Workshop on Ubiquitous Computing and Augmented Reality (Cambridge, MA, 24-25 February 1993).
Caudell92
Caudell, Thomas P. and David W. Mizell. Augmented Reality: An Application of Heads-Up Display Technology to Manual Manufacturing Processes. Proceedings of Hawaii International Conference on System Sciences (January 1992), 659-669.
Caudell94
Caudell, Thomas P. Introduction to Augmented Reality. SPIE Proceedings volume 2351: Telemanipulator and Telepresence Technologies (Boston, MA, 31 October - 4 November 1994), 272-281.
Chen93
Chen, Shenchang Eric, and Lance Williams. View Interpolation for Image Synthesis. Proceedings of SIGGRAPH 93 (Anaheim, CA, 1-6 August 1993). In Computer Graphics, Annual Conference Series, 1993, 279-288.
Deering92
Deering, Michael. High Resolution Virtual Reality. Proceedings of SIGGRAPH '92 (Chicago, IL, 26-31 July 1992). In Computer Graphics 26, 2 (July 1992), 195-202.
Doenges85
Doenges, Peter K. Overview of Computer Image Generation in Visual Simulation. SIGGRAPH '85 Course Notes #14 on High Performance Image Generation Systems (San Francisco, CA, 22 July 1985).
Dornheim95a
Dornheim, Michael A. and David Hughes. U.S. Intensifies Efforts to Meet Missile Threat. Aviation Week and Space Technology 143, 16 (16 October 1995), 36-39.
Dornheim95b
Dornheim, Michael A. U.S. Fighters to Get Helmet Displays After 2000. Aviation Week and Space Technology 143, 17 (23 October 1995), 46-48.
Dornheim95c
Dornheim, Michael A. Helmet-Mounted Sights Must Overcome Delays. Aviation Week and Space Technology 143, 17 (23 October 1995), 54.
Drascic91
Drascic, David and Paul Milgram. Positioning Accuracy of a Virtual Stereographic Pointer in a Real Stereoscopic Video World. SPIE Proceedings Volume 1457 - Stereoscopic Displays and Applications II (San Jose, CA, February 1991), 302-313.
Drascic93a
Drascic, D., J.J. Grodski, P. Milgram, K. Ruffo, P. Wong, and S. Zhai. ARGOS: A Display System for Augmenting Reality. Video Proceedings of INTERCHI '93: Human Factors in Computing Systems (Amsterdam, the Netherlands, 24-29 April 1993). Also in ACM SIGGRAPH Technical Video Review, Volume 88. Extended abstract in Proceedings of INTERCHI '93, 521.
Drascic93b
Drascic, David. Stereoscopic Vision and Augmented Reality. Scientific Computing & Automation 9, 7 (June 1993), 31-34.
Durlach95
Durlach, Nathaniel I. and Anne S. Mavor (editors). Virtual Reality: Scientific and Technological Challenges. (Report of the Committee on Virtual Reality Research and Development to the National Research Council) National Academy Press (1995). ISBN 0-309-05135-5.
Edwards93
Edwards, Emily, Jannick Rolland, and Kurtis Keller. Video See-through Design for Merging of Real and Virtual Environments. Proceedings of IEEE VRAIS '93 (Seattle, WA, 18-22 September 1993), 222-233.
Edwards95
Edwards, P.J., D.L.G. Hill, D.J. Hawkes, R. Spink, A.C.F. Colchester, A. Strong, and M. Gleeson. Neurosurgical Guidance Using the Stereo Microscope. Proceedings of Computer Vision, Virtual Reality, and Robotics in Medicine '95 (CVRMed '95) (Nice, France, 3-6 April 1995), 555-564.
Ellis94
Ellis, Stephen R. and Urs J. Bucher. Distance Perception of Stereoscopically Presented Virtual Objects Optically Superimposed on Physical Objects by a Head-Mounted See-Through Display. Proceedings of 38th Annual Meeting of the Human Factors and Ergonomics Society (Nashville, TN, 24-28 October 1994), 1300-1305.
Ellis95
Ellis, Stephen R. and Brian M. Menges. Judged Distance to Virtual Objects in the Near Visual Field. Proceedings of 39th Annual Meeting of the Human Factors and Ergonomics Society (San Diego, CA, 1995), 1400-1404.
Emura94
Emura, Satoru and Susumu Tachi. Compensation of Time Lag Between Actual and Virtual Spaces by Multi-Sensor Integration. Proceedings of the 1994 IEEE International Conference on Multisensor Fusion and Integration for Intelligent Systems (Las Vegas, NV, 2-5 October 1994), 463-469.
Feiner93a
Feiner, Steven, Blair MacIntyre, and Dorée Seligmann. Knowledge-based Augmented Reality. Communications of the ACM 36, 7 (July 1993), 52-62.
Feiner93b
Feiner, Steven, Blair MacIntyre, Marcus Haupt, and Eliot Solomon. Windows on the World: 2D Windows for 3D Augmented Reality. Proceedings of UIST '93 (Atlanta, GA, 3-5 November 1993), 145-155.
Feiner94a
Feiner, Steven. Augmented Reality. Course Notes, 2: ACM SIGGRAPH 1994, 7:1-7:11.
Feiner94b
Feiner, Steven. Redefining the User Interface: Augmented Reality. Course Notes, 2: ACM SIGGRAPH 1994, 18:1-18:7.
Feiner95
Feiner, Steven K., Anthony C. Webster, Theodore E. Krueger III, Blair MacIntyre, and Edward J. Keller. Architectural Anatomy. Presence: Teleoperators and Virtual Environments 4, 3 (Summer 1995), 318-325.
Ferrin91
Ferrin, Frank J. Survey of Helmet Tracking Technologies. SPIE Proceedings Vol. 1456 Large-Screen Projection, Avionic, and Helmet-Mounted Displays (1991), 86-94.
Fitzmaurice93
Fitzmaurice, George. Situated Information Spaces: Spatially Aware Palmtop Computers. CACM 36, 7 (July 1993), 38-49.
Foley90
Foley, James D., Andries van Dam, Steven K. Feiner, and John F. Hughes. Computer Graphics: Principles and Practice (2nd edition). Addison-Wesley (1990).
Fournier94
Fournier, Alain. Illumination Problems in Computer Augmented Reality. Journée INRIA, Analyse / Synthèse D'Images (January 1994), 1-21.
Foxlin96
Foxlin, Eric. Inertial Head-Tracker Sensor Fusion by a Complementary Separate-Bias Kalman Filter. Proceedings of VRAIS '96 (Santa Clara, CA, 30 March - 3 April 1996), 185-194.
Ghazisaedy95
Ghazisaedy, Morteza, David Adamczyk, Daniel J. Sandin, Robert V. Kenyon, and Thomas A. DeFanti. Ultrasonic Calibration of a Magnetic Tracker in a Virtual Reality Space. Proceedings of VRAIS '95 (Research Triangle Park, NC, 11-15 March 1995), 179-188.
Grimson94
Grimson, W., T. Lozano-Pérez, W. Wells, G. Ettinger, S. White and R. Kikinis. An Automatic Registration Method for Frameless Stereotaxy, Image Guided Surgery, and Enhanced Reality Visualization. Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (Los Alamitos, CA, June 1994), 430-436.
Grimson95
Grimson, W.E.L., G.J. Ettinger, S.J. White, P.L. Gleason, T. Lozano-Pérez, W.M. Wells III, and R. Kikinis. Evaluating and Validating an Automated Registration System for Enhanced Reality Visualization in Surgery. Proceedings of Computer Vision, Virtual Reality, and Robotics in Medicine '95 (CVRMed '95) (Nice, France, 3-6 April 1995), 3-12.
Holloway95
Holloway, Richard. Registration Errors in Augmented Reality. Ph.D. dissertation. UNC Chapel Hill Department of Computer Science technical report TR95-016 (August 1995).
Holmgren92
Holmgren, Douglas E. Design and Construction of a 30-Degree See-Through Head-Mounted Display. UNC Chapel Hill Department of Computer Science technical report TR 92-030 (July 1992), 4 pages.
Iu96
Iu, Siu-Leong and Kevin W. Rogovin. Registering Perspective Contours with 3-D Objects Without Correspondence Using Orthogonal Polynomials. Proceedings of VRAIS '96 (Santa Clara, CA, 30 March - 3 April 1996), 37-44.
Jain89
Jain, Anil K. Fundamentals of Digital Image Processing. Prentice Hall (1989). ISBN 0-13-336165-9.
Janin93
Janin, Adam L., David W. Mizell, and Thomas P. Caudell. Calibration of Head-Mounted Displays for Augmented Reality Applications. Proceedings of IEEE VRAIS '93 (Seattle, WA, 18-22 September 1993), 246-255.
Janin94
Janin, Adam, Karel Zikan, David Mizell, Mike Banner, and Henry Sowizral. A Videometric Head Tracker for Augmented Reality. SPIE Proceedings volume 2351: Telemanipulator and Telepresence Technologies (Boston, MA, 31 October - 4 November 1994), 308-315.
Kancherla95
Kancherla, Anantha R., Jannick P. Rolland, Donna L. Wright, and Grigore Burdea. A Novel Virtual Reality Tool for Teaching Dynamic 3D Anatomy. Proceedings of Computer Vision, Virtual Reality, and Robotics in Medicine '95 (CVRMed '95) (Nice, France, 3-6 April 1995), 163-169.
Kim93
Kim, Won S. Advanced Teleoperation, Graphics Aids, and Application to Time Delay Environments. Proceedings of the 1st Industrial Virtual Reality Show and Conference (IVR '93) (Makuhari Messe, Japan, 23-25 June 1993), 202-207.
Kim96
Kim, Won S. Virtual Reality Calibration and Preview / Predictive Displays for Telerobotics. Presence: Teleoperators and Virtual Environments 5, 2 (Spring 1996), 173-190.
Krueger92
Krueger, Myron W. Simulation Versus Artificial Reality. Proceedings of IMAGE VI Conference (Scottsdale, AZ, 14-17 July 1992), 147-155.
Kutulakos96
Kutulakos, Kiriakos N. and James Vallino. Affine Object Representations for Calibration-Free Augmented Reality. Proceedings of VRAIS '96 (Santa Clara, CA, 30 March - 3 April 1996), 25-36.
Lenz88
Lenz, Reimar K. and Roger Y. Tsai. Techniques for Calibration of the Scale Factor and Image Center for High Accuracy 3-D Machine Vision Metrology. IEEE Transactions on Pattern Analysis and Machine Intelligence 10, 5 (September 1988), 713-720.
Lion93
Lion, Dav, Craig Rosenberg, and Woodrow Barfield. Overlaying Three-Dimensional Computer Graphics with Stereoscopic Live Motion Video: Applications for Virtual Environments. Society for Information Display International Symposium Digest of Technical Papers (Seattle, WA, 18-29 May 1993), 483-486.
Lorensen93
Lorensen, William, Harvey Cline, Christopher Nafis, Ron Kikinis, David Altobelli, and Langham Gleason. Enhancing Reality in the Operating Room. Proceedings of Visualization '93 (Los Alamitos, CA, October 1993), 410-415.
MacKay93
MacKay, Wendy, Gilles Velay, Kathy Carter, Chaoying Ma, and Daniele Pagani. Augmenting Reality: Adding Computational Dimensions to Paper. CACM 36, 7 (July 1993), 96-97.
Maes95
Maes, Pattie. Artificial Life Meets Entertainment: Lifelike Autonomous Agents. CACM 38, 11 (November 1995), 108-114.
Madritsch96
Madritsch, F. and M. Gervautz. CCD-Camera Based Optical Beacon Tracking for Virtual and Augmented Reality. Proceedings of Eurographics '96 (Futuroscope - Poitiers, France, 26-30 August 1996).
Mair94
Mair, Susan G. Preliminary Report on SIGGRAPH in the 21st Century: Rediscovering Our Fire. Computer Graphics 28, 4 (November 1994), 288-296.
McMillan95a
McMillan, Leonard and Gary Bishop. Head-Tracked Stereoscopic Display Using Image Warping. SPIE Proceedings 2409 Electronic Imaging Science and Technology (San Jose, CA, 5-10 February 1995), 21-30.
McMillan95b
McMillan, Leonard and Gary Bishop. Plenoptic Modeling. Proceedings of SIGGRAPH 95 (Los Angeles, CA, 6-11 August 1995). In Computer Graphics, Annual Conference Series, 1995, 39-46.
Mellor95a
Mellor, J. P. Enhanced Reality Visualization in a Surgical Environment. MS Thesis, Department of Electrical Engineering, MIT (13 January 1995).
Mellor95b
Mellor, J.P. Realtime Camera Calibration for Enhanced Reality Visualization. Proceedings of Computer Vision, Virtual Reality, and Robotics in Medicine '95 (CVRMed '95) (Nice, France, 3-6 April 1995), 471-475.
Meyer92
Meyer, Kenneth, Hugh L. Applewhite, and Frank A. Biocca. A Survey of Position-Trackers. Presence: Teleoperators and Virtual Environments 1, 2 (Spring 1992), 173-200.
Milgram93
Milgram, Paul, Shumin Zhai, David Drascic, and Julius J. Grodski. Applications of Augmented Reality for Human-Robot Communication. Proceedings of International Conference on Intelligent Robotics and Systems (Yokohama, Japan, July 1993), 1467-1472.
Milgram94a
Milgram, Paul, and Fumio Kishino. A Taxonomy of Mixed Reality Virtual Displays. IEICE Transactions on Information and Systems E77-D, 9 (September 1994), 1321-1329.
Milgram94b
Milgram, Paul, Haruo Takemura, Akira Utsumi, and Fumio Kishino. Augmented Reality: A Class of Displays on the Reality-Virtuality Continuum. SPIE Proceedings volume 2351: Telemanipulator and Telepresence Technologies (Boston, MA, 31 October - 4 November 1994), 282-292.
Milgram95
Milgram, Paul, David Drascic, Julius J. Grodski, Anu Restogi, Shumin Zhai, and Chin Zhou. Merging Real and Virtual Worlds. Proceedings of IMAGINA '95 (Monte Carlo, 1-3 February 1995), 218-230.
Mine93
Mine, Mark R. Characterization of End-to-End Delays in Head-Mounted Display Systems. UNC Chapel Hill Department of Computer Science technical report TR 93-001 (March 1993), 11 pages.
Neumann96
Neumann, Ulrich and Youngkwan Cho. A Self-Tracking Augmented Reality System. Proceedings of VRST '96 (Hong Kong, 1-4 July 1996), 109-115.
Oishi96
Oishi, Takashi and Susumu Tachi. Methods to Calibrate Projection Transformation Parameters for See-Through Head-Mounted Displays. Presence: Teleoperators and Virtual Environments 5, 1 (Winter 1996), 122-135.
Olano95
Olano, Marc, Jon Cohen, Mark Mine, and Gary Bishop. Combating Graphics System Latency. Proceedings of 1995 Symposium on Interactive 3D Graphics (Monterey, CA, 9-12 April 1995), 19-24.
Oyama93
Oyama, Eimei, Naoki Tsunemoto, Susumu Tachi, and Yasuyuki Inoue. Experimental Study on Remote Manipulation Using Virtual Reality. Presence: Teleoperators and Virtual Environments 2, 2 (Spring 1993), 112-124.
Pausch92
Pausch, Randy, Thomas Crea, and Matthew Conway. A Literature Survey for Virtual Environments: Military Flight Simulator Visual Systems and Simulator Sickness. Presence: Teleoperators and Virtual Environments 1, 3 (Summer 1992), 344-363.
Peuchot95
Peuchot, Bernard, Alain Tanguy, and Michel Eude. Virtual Reality as an Operative Tool During Scoliosis Surgery. Proceedings of Computer Vision, Virtual Reality, and Robotics in Medicine '95 (CVRMed '95) (Nice, France, 3-6 April 1995), 549-554.
Regan94
Regan, Matthew, and Ronald Pose. Priority Rendering with a Virtual Reality Address Recalculation Pipeline. Proceedings of SIGGRAPH 94 (Orlando, FL, 24-29 July 1994). In Computer Graphics, Annual Conference Series, 1994, 155-162.
Rekimoto95a
Rekimoto, Jun, and Katashi Nagao. The World Through the Computer: Computer Augmented Interaction with Real World Environments. Proceedings of UIST '95 (Pittsburgh, PA, 14-17 November 1995), 29-36.
Rekimoto95b
Rekimoto, Jun. The Magnifying Glass Approach to Augmented Reality Systems. Proceedings of ICAT '95 (Makuhari Messe, Chiba, Japan, 20-22 November 1995).
Riner92
Riner, Bruce and Blair Browder. Design Guidelines for a Carrier-Based Training System. Proceedings of IMAGE VI (Scottsdale, AZ, July 14-17 1992), 65-73.
Robinett92a
Robinett, Warren. Synthetic Experience: A Proposed Taxonomy. Presence: Teleoperators and Virtual Environments 1, 2 (Spring 1992), 229-247.
Robinett92b
Robinett, Warren and Jannick Rolland. A Computational Model for the Stereoscopic Optics of a Head-Mounted Display. Presence: Teleoperators and Virtual Environments 1, 1 (Winter 1992), 45-62.
Rolland93
Rolland, Jannick P., and Terry Hopkins. A Method of Computational Correction for Optical Distortion in Head-Mounted Displays. UNC Chapel Hill Department of Computer Science technical report TR93-045 (1993).
Rolland94
Rolland, Jannick, Rich Holloway, and Henry Fuchs. A Comparison of Optical and Video See-Through Head-Mounted Displays. SPIE Proceedings volume 2351: Telemanipulator and Telepresence Technologies (Boston, MA, 31 October - 4 November 1994), 293-307.
Rolland95
Rolland, Jannick, Frank Biocca, Todd Barlow, and Anantha Kancherla. Quantification of Adaptation to Virtual-Eye Location in See-Thru Head-Mounted Displays. Proceedings of IEEE VRAIS '95 (Research Triangle Park, NC, 11-15 March 1995), 56-66.
Rose95
Rose, Eric, David Breen, Klaus Ahlers, Chris Crampton, Mihran Tuceryan, Ross Whitaker, and Douglas Greer. Annotating Real-World Objects Using Augmented Reality. Proceedings of Computer Graphics International '95 (Leeds, UK, 25-30 June 1995), 357-370.
Rosen96
Rosen, Joseph M., Hooman Soltanian, Richard J. Redett and Donald R. Laub. Evolution of Virtual Reality: From Planning to Performing Surgery. IEEE Engineering in Medicine and Biology 15, 2 (March / April 1996), 16-22.
Sharma94
Sharma, Rajeev, and Jose Molineros. Role of Computer Vision in Augmented Virtual Reality. SPIE Proceedings volume 2351: Telemanipulator and Telepresence Technologies (Boston, MA, 31 October - 4 November 1994), 220-231.
Simon94
Simon, David A., Martial Hebert, and Takeo Kanade. Techniques for Fast and Accurate Intra-Surgical Registration. Proceedings of the First International Symposium on Medical Robotics and Computer Assisted Surgery (MRCAS) (September 1994), 90-97.
Sims94
Sims, Dave. New Realities in Aircraft Design and Manufacture. IEEE Computer Graphics and Applications 14, 2 (March 1994), 91.
So92
So, Richard H. Y. and Michael J. Griffin. Compensating Lags in Head-Coupled Displays Using Head Position Prediction and Image Deflection. Journal of Aircraft 29, 6 (November - December 1992), 1064-1068.
Sowizral93
Sowizral, Henry, and James Barnes. Tracking Position and Orientation in a Large Volume. Proceedings of IEEE VRAIS '93 (Seattle, WA, 18-22 September 1993), 132-139.
State94
State, Andrei, David T. Chen, Chris Tector, Andrew Brandt, Hong Chen, Ryutarou Ohbuchi, Mike Bajura and Henry Fuchs. Case Study: Observing a Volume Rendered Fetus within a Pregnant Patient. Proceedings of IEEE Visualization '94 (Washington D.C., 17-21 October 1994), 364-368.
State96a
State, Andrei, Gentaro Hirota, David T. Chen, Bill Garrett, and Mark Livingston. Superior Augmented Reality Registration by Integrating Landmark Tracking and Magnetic Tracking. Proceedings of SIGGRAPH 96 (New Orleans, LA, 4-9 August 1996), 429-438.
State96b
State, Andrei, Mark A. Livingston, Gentaro Hirota, William F. Garrett, Mary C. Whitton, Henry Fuchs and Etta D. Pisano. Techniques for Augmented-Reality Systems: Realizing Ultrasound-Guided Needle Biopsies. Proceedings of SIGGRAPH 96 (New Orleans, LA, 4-9 August 1996), 439-446.
Taubes94
Taubes, Gary. Surgery in Cyberspace. Discover 15, 12 (December 1994), 84-94.
Tharp94
Tharp, Greg, Samad Hayati and Linh Phan. Virtual Window Telepresence System for Telerobotic Inspection. SPIE Proceedings volume 2351: Telemanipulator and Telepresence Technologies (Boston, MA, 31 October - 4 November 1994), 366-373.
Torrance95
Torrance, Mark C. Advances in Human-Computer Interaction: The Intelligent Room. CHI '95 Research Symposium (Denver, CO, 6-7 May 1995).
Tuceryan95
Tuceryan, Mihran, Douglas S. Greer, Ross T. Whitaker, David Breen, Chris Crampton, Eric Rose, and Klaus H. Ahlers. Calibration Requirements and Procedures for Augmented Reality. IEEE Transactions on Visualization and Computer Graphics 1, 3 (September 1995), 255-273.
Uenohara95
Uenohara, Michihiro and Takeo Kanade. Vision-Based Object Registration for Real-Time Image Overlay. Proceedings of Computer Vision, Virtual Reality, and Robotics in Medicine '95 (CVRMed '95) (Nice, France, 3-6 April 1995), 13-22.
Utsumi94
Utsumi, Akira, Paul Milgram, Haruo Takemura, and Fumio Kishino. Effects of Fuzziness in Perception of Stereoscopically Presented Virtual Object Location. SPIE Proceedings volume 2351: Telemanipulator and Telepresence Technologies (Boston, MA, 31 October - 4 November 1994), 337-344.
Wanstall89
Wanstall, Brian. HUD on the Head for Combat Pilots. Interavia 44 (April 1989), 334-338. [A89-39227].
Ward92
Ward, Mark, Ronald Azuma, Robert Bennett, Stefan Gottschalk, and Henry Fuchs. A Demonstrated Optical Tracker with Scalable Work Area for Head-Mounted Display Systems. Proceedings of 1992 Symposium on Interactive 3D Graphics (Cambridge, MA, 29 March - 1 April 1992), 43-52.
Watson95
Watson, Benjamin and Larry Hodges. Using Texture Maps to Correct for Optical Distortion in Head-Mounted Displays. Proceedings of IEEE VRAIS '95 (Research Triangle Park, NC, 11-15 March 1995), 172-178.
Welch78
Welch, Robert B. Perceptual Modification: Adapting to Altered Sensory Environments. Academic Press (1978). ISBN 0-12-741850-4.
Wellner93
Wellner, Pierre. Interacting with Paper on the DigitalDesk. CACM 36, 7 (July 1993), 86-96.
Whitaker95
Whitaker, Ross T., Chris Crampton, David E. Breen, Mihran Tuceryan and Eric Rose. Object Calibration for Augmented Reality. Proceedings of Eurographics '95 (Maastricht, The Netherlands, August 1995), 15-27.
Wloka95a
Wloka, Matthias M. Lag in Multiprocessor Virtual Reality. Presence: Teleoperators and Virtual Environments 4, 1 (Winter 1995), 50-63.
Wloka95b
Wloka, Matthias M. and Brian G. Anderson. Resolving Occlusion in Augmented Reality. Proceedings of 1995 Symposium on Interactive 3D Graphics (Monterey, CA, 9-12 April 1995), 5-12.
Wu95
Wu, Jiann-Rong and Ming Ouhyoung. A 3D Tracking Experiment on Latency and its Compensation Methods in Virtual Environments. Proceedings of UIST '95 (Pittsburgh, PA, 14-17 November 1995), 41-49.
Yoo93
Yoo, Terry S., and T. Marc Olano. Instant Hole™ (Windows Onto Reality). UNC Chapel Hill Department of Computer Science technical report TR93-027 (1993), 13 pages.
Zikan94a Zikan94b
Zikan, Karel, W. Dan Curtis, Henry Sowizral, and Adam Janin. Fusion of Absolute and Incremental Position and Orientation Sensors. SPIE Proceedings volume 2351: Telemanipulator and Telepresence Technologies (Boston, MA, 31 October - 4 November 1994), 316-327.
Zikan, Karel, W. Dan Curtis, Henry A. Sowizral, and Adam L. Janin. A Note on Dynamics of Human Head Motions and on Predictive Filtering of Head-Set Orientations. SPIE Proceedings volume 2351: Telemanipulator and Telepresence Technologies (Boston, MA, 31 October - 4 November 1994), 328-336.

View File

@@ -0,0 +1,15 @@
Title: A Survey of Augmented Reality
Subject:
Keywords:
Author: Azuma, Ronald T
Creator: Microsoft Word
Producer: Acrobat PDFWriter 2.0 for Macintosh
CreationDate: 08/19/97 09:08:39
Tagged: no
Form: none
Pages: 48
Encrypted: no
Page size: 612 x 792 pts (letter) (rotated 0 degrees)
File size: 490198 bytes
Optimized: no
PDF version: 1.1

View File

@@ -0,0 +1 @@
{"pageIndex":2,"scale":"page-width","top":477,"left":-7,"scrollMode":0,"spreadMode":0}

View File

@@ -0,0 +1,336 @@
One-point Calibration Gaze Tracking Based on Eyeball Kinematics Using Stereo Cameras
Takashi Nagamatsu Kobe University
Junzo Kamahara† Kobe University
Takumi Iko‡ Kobe University
Naoki Tanaka§ Kobe University
Abstract
This paper presents a one-point calibration gaze tracking method based on eyeball kinematics using stereo cameras. By using two cameras and two light sources, the optic axis of the eye can be estimated. One-point calibration is required to estimate the angle of the visual axis from the optic axis. The eyeball rotates with its optic and visual axes according to the eyeball kinematics (Listing's law). Therefore, we introduced eyeball kinematics into the one-point calibration process in order to properly estimate the visual axis. A prototype system was developed, and it was found that the accuracy was under 1° around the center and bottom of the display.
CR Categories: H.5.2 [Information Interfaces and Presentation]: User Interfaces - Ergonomics; I.4.9 [Image Processing and Computer Vision]: Applications
Keywords: eye tracking, calibration, stereo camera, eyeball kinematics
1 Introduction
Gaze tracking technology is being used as a human-machine interface [Jacob 1991; Duchowski 2007; Nagamatsu et al. 2007]. However, most accurate gaze tracking systems require a personal calibration process before the system can be used. Several works have attempted to reduce the calibration effort by using a model-based approach [Ohno and Mukawa 2004; Shih and Liu 2004; Ohno 2006; Guestrin and Eizenman 2006; Guestrin and Eizenman 2007; Villanueva and Cabeza 2007]. For example, Shih and Liu [2004] and Guestrin and Eizenman [2006; 2007] have presented methods to reconstruct the optic axis of the eye using stereo cameras without actually knowing the personal eye parameters. Since the optic axis approximates the visual axis with an accuracy of approximately 5°, after the reconstruction of the optic axis the visual axis is estimated by at least a one-point calibration procedure in their work.
However, most previous works did not consider the eyeball kinematics when estimating the visual axis; in contrast, Villanueva and Cabeza [2007] pointed out the importance of eyeball kinematics. In this paper, we describe one-point calibration gaze tracking based on eyeball kinematics using stereo cameras.
2 Estimation of optic axis
In this section, we describe a method for the estimation of the optic axis. The cameras are modeled as pinhole cameras and the light sources are modeled as point sources. The position of the light sources must be measured and the intrinsic and extrinsic camera parameters must be determined in advance by the camera calibration.
2.1 Image processing of eye image
The eye images are captured by two cameras. From the image captured by camera 0, the center of the pupil, B′0, is detected by ellipse fitting, and the centers of the first Purkinje images (the reflections of the light sources L0 and L1 from the outer surface of the cornea), P′00 and P′01, are detected by searching near the center of the pupil. Similarly, from the image captured by camera 1, B′1, P′10, and P′11 are detected, where B′1 is the center of the pupil and P′10 and P′11 are the centers of the first Purkinje images of L0 and L1 captured by camera 1.
2.2 Center of corneal curvature
We estimate the position of the center of the corneal curvature, A. Figure 1 shows a cross section of the eyeball that includes A, light source 0, L0, and the nodal point of camera 0, C0. L0 and C0 are known. A ray from L0 reflects at a point P00 on the corneal surface such that the reflected ray passes through C0 and intersects the camera image plane at a point P′00. C0, P00, L0, P′00, and A are coplanar, and the normal vector of the plane, π00, is (P′00 − C0) × (L0 − C0). The plane can be expressed as

(P′00 − C0) × (L0 − C0) · (x − C0) = 0    (1)

where x (= (x, y, z)) is a point on the plane.

Similarly, with regard to the relation among the cameras C0 and C1, the light sources L0 and L1, and the Purkinje images P′01, P′10, and P′11, three other planes, π01, π10, and π11, can be expressed respectively as

(P′01 − C0) × (L1 − C0) · (x − C0) = 0    (2)
(P′10 − C1) × (L0 − C1) · (x − C1) = 0    (3)
(P′11 − C1) × (L1 − C1) · (x − C1) = 0.   (4)
e-mail:nagamatu@kobe-u.ac.jp †e-mail:kamahara@maritime.kobe-u.ac.jp ‡e-mail:071w101w@stu.kobe-u.ac.jp §e-mail:ntanaka@maritime.kobe-u.ac.jp
Copyright © 2008 by the Association for Computing Machinery, Inc. Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for components of this work owned by others than ACM must be honored. Abstracting with credit is permitted. To copy otherwise, to republish, to post on servers, or to redistribute to lists, requires prior specific permission and/or a fee. Request permissions from Permissions Dept, ACM Inc., fax +1 (212) 869-0481 or e-mail permissions@acm.org. ETRA 2008, Savannah, Georgia, March 2628, 2008. © 2008 ACM 978-1-59593-982-1/08/0003 $5.00
All planes include A, and A can be determined when three of the planes are given. Two LEDs were installed to make an angle of 90° around a camera in order to improve its accuracy.
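Since each of the planes (1)-(4) passes through the corresponding camera's nodal point and contains A, the corneal center can be recovered as the (least-squares) intersection of the four planes. The following is a minimal numerical sketch of that step, not the authors' implementation; it assumes NumPy and that the Purkinje image centers have already been back-projected into the same world coordinate frame as the cameras and light sources (all function and variable names are illustrative).

```python
import numpy as np

def plane_normal(purkinje_img, light, cam):
    """Normal of the plane spanned by the back-projected Purkinje image point,
    the light source, and the camera nodal point (3-vectors in world coords)."""
    return np.cross(purkinje_img - cam, light - cam)

def corneal_center(C0, C1, L0, L1, P00, P01, P10, P11):
    """Least-squares intersection of the four planes of eqs. (1)-(4); every plane
    passes through its camera's nodal point and contains A."""
    rows, rhs = [], []
    for cam, light, purkinje in ((C0, L0, P00), (C0, L1, P01),
                                 (C1, L0, P10), (C1, L1, P11)):
        n = plane_normal(purkinje, light, cam)
        rows.append(n)
        rhs.append(n @ cam)          # n . (x - cam) = 0  <=>  n . x = n . cam
    A, *_ = np.linalg.lstsq(np.vstack(rows), np.array(rhs), rcond=None)
    return A
```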
2.3 Center of pupil
Radius of cornea. In order to estimate the radius of the cornea, we estimate the reflection point, P00, which lies on the bisector line between L0 − A and C0 − A, as shown in Figure 1. The line AP00 can be expressed in parametric form as

x = A + t ( (L0 − A)/|L0 − A| + (C0 − A)/|C0 − A| )    (5)
Figure 1: Center of the Corneal Curvature.
where t is a parameter. On the other hand, the line from C0 to P00 can be expressed in parametric form as
x = C0 + t (C0 − P′00).    (6)

The position of P00 can be estimated as the intersection point of the above two lines. Therefore, the radius of the cornea, r, is determined as

r = |P00 − A|.    (7)
Since r can be determined using the three other combinations (C0, L1, P′01), (C1, L0, P′10), and (C1, L1, P′11), the average is used to improve robustness.
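A small sketch of equations (5)-(7): P00 is taken as the closest point between the bisector line through A and the back-projected camera ray, and the corneal radius follows as |P00 − A|. This is an illustrative reconstruction under the same assumptions as above (NumPy, shared world coordinates, illustrative names), not the paper's code.

```python
import numpy as np

def closest_point_between_lines(p1, d1, p2, d2):
    """Midpoint of the shortest segment between the (non-parallel) lines
    x = p1 + t*d1 and x = p2 + s*d2."""
    d1, d2 = d1 / np.linalg.norm(d1), d2 / np.linalg.norm(d2)
    w0 = p1 - p2
    b = d1 @ d2
    d, e = d1 @ w0, d2 @ w0
    denom = 1.0 - b * b
    t = (b * e - d) / denom
    s = (e - b * d) / denom
    return 0.5 * ((p1 + t * d1) + (p2 + s * d2))

def corneal_radius(A, C0, L0, P00_img):
    """Eqs. (5)-(7): P00 lies on the bisector line through A (eq. 5) and on the
    back-projected camera ray through C0 (eq. 6); the radius is |P00 - A| (eq. 7)."""
    bisector = (L0 - A) / np.linalg.norm(L0 - A) + (C0 - A) / np.linalg.norm(C0 - A)
    P00 = closest_point_between_lines(A, bisector, C0, C0 - P00_img)
    return np.linalg.norm(P00 - A)
```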
Refraction. As shown in Figure 2, a ray that originates from the center of the pupil, B, refracts at the point B″0, passes through the nodal point of camera 0, C0, and intersects the camera image plane at a point B′0. B″0 can be determined by solving the equations given below:

x = C0 + t (C0 − B′0)    (8)
r = |x − A|    (9)

Substituting (8) into (9) gives a quadratic in t; the root on the near (camera-facing) side of the cornea is

t = ( −b − sqrt( b² − a(c − r²) ) ) / a    (10)

where a = (C0x − B′0x)² + (C0y − B′0y)² + (C0z − B′0z)², b = (C0x − Ax)(C0x − B′0x) + (C0y − Ay)(C0y − B′0y) + (C0z − Az)(C0z − B′0z), and c = (C0x − Ax)² + (C0y − Ay)² + (C0z − Az)².
The refracted vector at B″0, t0, can be calculated by using Snell's law as follows:

t0 = ( ρ (n0 · v0) − sqrt( 1 − ρ² (1 − (n0 · v0)²) ) ) n0 + ρ v0    (11)

where the incident vector v0 = (C0 − B′0)/|C0 − B′0|, the normal vector at the point of refraction n0 = (B″0 − A)/|B″0 − A|, and ρ = n1/n2 (n1: the refractive index of air ≈ 1; n2: the effective refractive index ≈ 1.3375).
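Equation (11) in code form. This sketch simply mirrors the formula as printed, so the orientation conventions for v0 and n0 are those defined in the text; v and n are assumed to be unit vectors.

```python
import numpy as np

def refract(v, n, rho):
    """Refracted direction t0 of eq. (11): v is the incident unit vector, n the
    unit surface normal at the refraction point, rho = n1/n2 the index ratio."""
    c = n @ v
    k = 1.0 - rho * rho * (1.0 - c * c)
    if k < 0.0:
        raise ValueError("total internal reflection")
    return (rho * c - np.sqrt(k)) * n + rho * v
```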
Center of pupil. The center of the pupil, B, can be obtained from the intersection of the two refracted rays from the two cameras:

x = B″0 + t t0    (12)
x = B″1 + t t1    (13)

where B″1 and t1 are the refraction point and the refracted vector obtained with camera 1.
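Equations (12)-(13) ask for the point where the two refracted rays meet; with noisy data they will not intersect exactly, so a least-squares closest point is the natural reading. A hedged sketch with illustrative names (NumPy assumed):

```python
import numpy as np

def intersect_rays(origins, directions):
    """Point minimizing the summed squared distance to the rays x = o_i + t*d_i;
    used here for the two refracted rays of eqs. (12)-(13)."""
    S, q = np.zeros((3, 3)), np.zeros(3)
    for o, d in zip(origins, directions):
        d = d / np.linalg.norm(d)
        P = np.eye(3) - np.outer(d, d)   # projector onto the plane orthogonal to d
        S += P
        q += P @ o
    return np.linalg.solve(S, q)

# e.g. B = intersect_rays([B0_refr, B1_refr], [t0, t1])   # hypothetical variable names
```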
Figure 2: Refraction.
2.4 Optic axis
The optic axis can be written in parametric form as follows:

x = A + t (B − A).    (14)
3 One-Point Calibration
This section describes the calibration process that determines a unit direction vector of an optic axis, b, at the primary position, which is the position of the eye relative to the head when looking straight ahead at an object at eye level. In this paper, the unit direction vector of the visual axis at the primary position is (0, 0, 1)T (= a).
Figure 3 shows the eyeball model of rotation and translation. E0 and A0 are the rotation center of the eyeball and the center of the corneal curvature at the primary position, respectively. E, d, and c are the rotation center of the eyeball, a unit direction vector of the optic axis, and, a unit direction vector of the visual axis after the eye movement, respectively.
Figure 3: Model of an eyeball.
3.1 Movement of eyeball
The visual axis at the primary position can be expressed as x = A0 + t a, and the new visual axis is expressed as x = A + t c. If the eye moves, the visual axis rotates around E0 by the rotation matrix R, and E0 is translated to E by the translation matrix T. The relation can be written as

E + ((A − E) + t c) = T E0 + R((A0 − E0) + t a).    (15)

Since E = T E0 and (A − E) = R(A0 − E0), the relation between a and c can be written as

c = R a.    (16)
Therefore, in order to analyze the movement of the visual axis, we only have to analyze the movement of the direction vector of the visual axis.
3.2 Listing's law
Listing's law states that any eye position can be reached by a single rotation from the primary position and any rotation axis lies in a plane (Listing's plane) [Zatsiorsky 1999]. In this paper, the rotation axis of the eyeball, l, is assumed to be parallel to the xy plane. We calculate the eyeball rotation based on Listing's law.
3.3 Visual axis
D is assumed to be a point at the center of the display. When a user is gazing at D, the vector of the visual axis can be written as D − A. Figure 4 shows the relation among a, (D − A)/|D − A|, and l. The rotation from a to (D − A)/|D − A| by ψ can be written as a product of three rotation matrices:

(D − A)/|D − A| = R(z, φ) R(x, ψ) R(z, −φ) a    (17)
                = M a    (18)

where R(z, φ) is the rotation matrix around the z-axis by φ, R(x, ψ) is the rotation matrix around the x-axis by ψ, R(z, −φ) is the rotation matrix around the z-axis by −φ, and M is the product of the three matrices.

The unit vector of the axis of rotation, l, and the angle, ψ, can be written as

l = a × (D − A) / |a × (D − A)|    (19)
ψ = arccos( a · (D − A) / (|a| |D − A|) ).    (20)

Since φ is the angle made by l and (1, 0, 0)ᵀ (= e), it can be obtained as follows:

φ = arccos( e · l / (|e| |l|) ).    (21)
Figure 4: Rotation about the visual axis.

3.4 Estimation of visual axis at primary position
If b is the optic axis at the primary position, then the relation between (B − A) and b can be written using M as follows:

(B − A)/|B − A| = M b.    (22)

b can be obtained by using φ and ψ as

b = M⁻¹ (B − A)/|B − A|    (23)
  = ( R(z, φ) R(x, ψ) R(z, −φ) )⁻¹ (B − A)/|B − A|
  = R(z, φ) R(x, −ψ) R(z, −φ) (B − A)/|B − A|.    (24)

4 Estimation of visual axis
In this section, a method to estimate the visual axis using b, which was determined by calibration, is described.

4.1 Calculation of rotation axis and angle
d is a unit vector of the optic axis and it is given as d = (B − A)/|B − A|. l′ is a unit direction vector of the rotation axis of the eye, which is parallel to the xy plane. Figure 5 shows the relation among b, d, and l′. l′ is included in the xy plane a · x = 0 in the figure. Since b rotates to d around l′, l′ is also included in the plane (d − b) · x = 0. Therefore the unit direction vector of the axis is given as

l′ = a × (d − b) / |a × (d − b)|.    (25)

The rotation axis can be expressed as x = t l′. The intersection point between x = t l′ and the plane perpendicular to x = t l′ that includes b and d must be determined in order to estimate ψ. t can be determined by the inner product of l′ and b, which gives t = l′ · b. Therefore the center of rotation of b is (l′ · b) l′. The rotation angle of the eyeball is estimated as

ψ = arccos( ((l′ · b) l′ − b) · ((l′ · b) l′ − d) / ( |(l′ · b) l′ − b| |(l′ · b) l′ − d| ) ).    (26)

Figure 5: Rotation about the optic axis.
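Sections 3.3-3.4 reduce the one-point calibration to a handful of vector operations. The sketch below strings equations (17)-(24) together: from a single fixation on the known point D it forms M and recovers b, the optic-axis direction at the primary position. It assumes A, B, and D are given in the same world coordinates and that the primary visual axis is a = (0, 0, 1); the names are illustrative and this is not the authors' released code.

```python
import numpy as np

def rot(axis, angle):
    """Rotation matrix about the world x- or z-axis."""
    c, s = np.cos(angle), np.sin(angle)
    if axis == 'x':
        return np.array([[1, 0, 0], [0, c, -s], [0, s, c]], dtype=float)
    if axis == 'z':
        return np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]], dtype=float)
    raise ValueError(axis)

def calibrate_b(A, B, D, a=np.array([0.0, 0.0, 1.0])):
    """One-point calibration, eqs. (17)-(24): recover b, the optic-axis direction
    at the primary position, from a single fixation on the known point D."""
    g = (D - A) / np.linalg.norm(D - A)                 # gazed (visual-axis) direction
    l = np.cross(a, D - A)
    l /= np.linalg.norm(l)                              # rotation axis, eq. (19)
    psi = np.arccos(np.clip(a @ g, -1.0, 1.0))          # rotation angle, eq. (20)
    e = np.array([1.0, 0.0, 0.0])
    phi = np.arccos(np.clip(e @ l, -1.0, 1.0))          # eq. (21), sign ignored as in the text
    M = rot('z', phi) @ rot('x', psi) @ rot('z', -phi)  # eqs. (17)-(18)
    d = (B - A) / np.linalg.norm(B - A)                 # measured optic-axis direction
    return np.linalg.inv(M) @ d                         # eq. (23)
```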
4.2 Calculation of visual axis
The visual axis can be calculated by rotating a around l′ by the angle ψ. A quaternion is convenient for expressing the rotation of a point around a given axis. The elements of a and l′ are written as a = (ax, ay, az) and l′ = (l′x, l′y, l′z), respectively.
a is expressed in quaternion form as Pa = (0; ax, ay, az), and the rotation is expressed as Q = (cos(ψ/2); l′x sin(ψ/2), l′y sin(ψ/2), l′z sin(ψ/2)). Then, the visual axis vector, Pc, in quaternion form can be obtained as follows:

Pc = (0; cx, cy, cz) = Q Pa Q⁻¹.    (27)

Consequently, we obtain the direction vector of the visual axis, c = (cx, cy, cz).
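Equations (25)-(27) in the same style: given the calibrated b and the currently measured optic-axis direction d, the rotation axis and angle are recovered and a is rotated by a quaternion to give the visual-axis direction c. Again a sketch with illustrative names, not the authors' code.

```python
import numpy as np

def quat_mul(q, r):
    """Hamilton product of quaternions given as (w, x, y, z) arrays."""
    w1, x1, y1, z1 = q
    w2, x2, y2, z2 = r
    return np.array([w1*w2 - x1*x2 - y1*y2 - z1*z2,
                     w1*x2 + x1*w2 + y1*z2 - z1*y2,
                     w1*y2 - x1*z2 + y1*w2 + z1*x2,
                     w1*z2 + x1*y2 - y1*x2 + z1*w2])

def visual_axis(a, b, d):
    """Eqs. (25)-(27): rotate the primary visual axis a about l' by psi."""
    lp = np.cross(a, d - b)
    lp /= np.linalg.norm(lp)                                 # eq. (25)
    centre = (lp @ b) * lp                                   # centre of rotation of b on the axis
    u, v = b - centre, d - centre
    psi = np.arccos(np.clip(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)), -1.0, 1.0))  # eq. (26)
    Q = np.concatenate(([np.cos(psi / 2.0)], np.sin(psi / 2.0) * lp))
    Pa = np.concatenate(([0.0], a))
    Pc = quat_mul(quat_mul(Q, Pa), Q * np.array([1.0, -1.0, -1.0, -1.0]))  # eq. (27)
    return Pc[1:]                                            # c = (cx, cy, cz)
```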
4.3 Estimating gazing position on display
The display can be expressed by four points: DTL, DTR, DBL, and DBR. The normal vector of the display, nd, is (DTL − DBR) × (DTR − DBL); therefore, the display can be expressed as nd · (x − DTL) = 0. Since the visual axis is written as x = A + t c, the intersection point is the point of gaze (POG) on the display. After estimating the position of the intersection point, it is converted to graphical coordinates on the screen.
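The POG computation of this subsection is a standard ray-plane intersection; a minimal sketch (corner points assumed non-degenerate, NumPy arrays, illustrative names):

```python
import numpy as np

def point_of_gaze(A, c, D_tl, D_tr, D_bl, D_br):
    """Intersection of the visual axis x = A + t*c with the display plane."""
    nd = np.cross(D_tl - D_br, D_tr - D_bl)     # display normal
    t = nd @ (D_tl - A) / (nd @ c)
    return A + t * c
```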
5 Implementation and experimental results
5.1 Implementation
A prototype system was implemented, as shown in Figure 6. The system uses two synchronized monochrome IEEE-1394 digital cameras, each with a 1/3" CMOS image sensor (Firefly MV, Point Grey Research Inc.), a 50-mm lens, and an IR filter; two infrared light sources attached to a 17" LCD; and a Windows-platform PC. The software was developed using OpenCV [Intel]. When the user clicks the mouse, a fixation point is displayed; when the user, while gazing at the fixation point, releases the mouse button, the calibration is performed.
Figure 6: System.
5.2 Experimental results
The prototype system was evaluated experimentally with one adult subject. The subject was asked to fixate on 25 points on the display. The area within which the user could move was 3 cm laterally, 2.5 cm vertically, and 5 cm backward/forward, approximately 57.5 cm from the display. Figure 7 shows the result of the experiment. It was found that the accuracy was under 1°, except around the top left and top right of the display, where the measurement had low accuracy or was impossible. This may be because the periphery of the cornea is not spherical, or because an LED is reflected on the sclera; it may be attributed to the limitation of the two-light-source, two-camera arrangement. The next step will be to improve the arrangement of light sources and cameras in order to observe Purkinje images on the corneal surface near the optic axis.
6 Conclusion
This paper presented a one-point calibration gaze tracking method based on eyeball kinematics using stereo cameras. A prototype system was developed based on this method, and its accuracy was under 1° around the center and bottom of the display.
Acknowledgements
This research was partially supported by the Ministry of Education, Culture, Sports, Science and Technology, Grant-in-Aid for Young Scientists (B), 18700645, 2006.

Figure 7: Experimental results: the dotted points indicate the estimates of the POG, the 25 grid points represent the intended fixation points, and the numbers represent errors in view angle (degrees).
References
Duchowski, A. T. 2007. Eye Tracking Methodology: Theory and Practice, 2nd ed. Springer-Verlag.
Guestrin, E. D., and Eizenman, M. 2006. General theory of remote gaze estimation using the pupil center and corneal reflections. IEEE Transactions on Biomedical Engineering 53, 6, 1124-1133.
Guestrin, E. D., and Eizenman, M. 2007. Remote point-of-gaze estimation with free head movements requiring a single-point calibration. In Proceedings of the 29th Annual International Conference of the IEEE EMBS, 4556-4560.
Intel. Open source computer vision library. http://www.intel.com/technology/computing/opencv/index.htm.
Jacob, R. J. K. 1991. The use of eye movements in human-computer interaction techniques: what you look at is what you get. ACM Transactions on Information Systems 9, 2, 152-169.
Nagamatsu, T., Kaieda, Y., Kamahara, J., and Shimada, H. 2007. Development of a skill acquisition support system using expert's eye movement. In Proceedings of HCI International 2007, vol. 9, 430-439.
Ohno, T., and Mukawa, N. 2004. A free-head, simple calibration, gaze tracking system that enables gaze-based interaction. In Proceedings of the 2004 symposium on Eye tracking research & applications, 115-122.
Ohno, T. 2006. One-point calibration gaze tracking method. In Proceedings of the 2006 symposium on Eye tracking research & applications, 34.
Shih, S.-W., and Liu, J. 2004. A novel approach to 3-D gaze tracking using stereo cameras. IEEE Transactions on Systems, Man, and Cybernetics, Part B 34, 1, 234-245.
Villanueva, A., and Cabeza, R. 2007. Models for gaze tracking systems. EURASIP Journal on Image and Video Processing 2007, Article ID 23570.
Zatsiorsky, V. M. 1999. Kinematics of Human Motion (Japanese Edition). NAP Limited.

View File

@@ -0,0 +1,14 @@
Title: DVIOUT
Author: nagamatu
Creator: PScript5.dll Version 5.2
Producer: Acrobat Distiller 8.1.0 (Windows); modified using iText 4.2.0 by 1T3XT
CreationDate: 01/08/08 17:07:44
ModDate: 03/05/25 04:51:25
Tagged: no
Form: none
Pages: 4
Encrypted: no
Page size: 612 x 792 pts (letter) (rotated 0 degrees)
File size: 392477 bytes
Optimized: no
PDF version: 1.6

View File

@@ -0,0 +1,519 @@
delineating the absolute indigeneity of amino acids in fossils. As AMS techniques are refined to handle smaller samples, it may also become possible to date individual amino acid enantiomers by the 14C method. If one enantiomer is entirely derived from the other by racemization during diagenesis, the individual D- and L-enantiomers for a given amino acid should have identical 14C ages.
Older, more poorly preserved fossils may not always prove amenable to the determination of amino acid indigeneity by the stable isotope method, as the prospects for complete replacement of indigenous amino acids with non-indigenous amino acids increases with time. As non-indigenous amino acids undergo racemization, the enantiomers may have identical isotopic compositions and still not be related to the original organisms. Such a circumstance may, however, become easier to recognize as more information becomes available concerning the distribution and stable isotopic composition of the amino acid constituents of modern representatives of fossil organisms. Also, AMS dates on individual amino acid enantiomers may, in some cases, help to clarify indigeneity problems, in particular when stratigraphic controls can be used to estimate a general age range for the fossil in question.
Finally, the development of techniques for determining the stable isotopic composition of amino acid enantiomers may enable us to establish whether non-racemic amino acids in some carbonaceous meteorites27 are indigenous, or result in part from terrestrial contamination.
M.H.E. thanks the NSF, Division of Earth Sciences (grant EAR-8352055) and the following contributors to his Presidential Young Investigator Award for partial support of this research:
Arco, Exxon, Phillips Petroleum, Texaco Inc., The Upjohn Co. We also acknowledge the donors of the Petroleum Research Fund, administered by the American Chemical Society (grant 16144-AC2 to M.H.E., grant 14805-AC2 to S.A.M.) for support. S.A.M. acknowledges NSERC (grant A2644) for partial support.
Received 19 May; accepted 15 July 1986.
1. Bada, J. L. & Protsch, R. Proc. natn. Acad. Sci. U.S.A. 70, 1331-1334 (1973).
2. Bada, J. L., Schroeder, R. A. & Carter, G. F. Science 184, 791-793 (1974).
3. Boulton, G. S. et al. Nature 298, 437-441 (1982).
4. Wehmiller, J. F. in Quaternary Dating Methods (ed. Mahaney, W. C.) 171-193 (Elsevier, Amsterdam, 1984).
5. Engel, M. H., Zumberge, J. E. & Nagy, B. Analyt. Biochem. 82, 415-422 (1977).
6. Bada, J. L. A. Rev. Earth planet. Sci. 13, 241-268 (1985).
7. Chisholm, B. S., Nelson, D. E. & Schwarcz, H. P. Science 216, 1131-1132 (1982).
8. Ambrose, S. H. & DeNiro, M. J. Nature 319, 321-324 (1986).
9. Macko, S. A., Estep, M. L. F., Hare, P. E. & Haering, T. C. Yb. Carnegie Instn Wash. 82, 404-410 (1983).
10. Hare, P. E. & Estep, M. L. F. Yb. Carnegie Instn Wash. 82, 410-414 (1983).
11. Engel, M. H. & Hare, P. E. in Chemistry and Biochemistry of the Amino Acids (ed. Barrett, G. C.) 462-479 (Chapman and Hall, London, 1985).
12. Johnstone, R. A. W. & Rose, M. E. in Chemistry and Biochemistry of the Amino Acids (ed. Barrett, G. C.) 480-524 (Chapman and Hall, London, 1985).
13. Weinstein, S., Engel, M. H. & Hare, P. E. in Practical Protein Chemistry - A Handbook (ed. Darbre, A.) 337-344 (Wiley, New York, 1986).
14. Bada, J. L., Gillespie, R., Gowlett, J. A. J. & Hedges, R. E. M. Nature 312, 442-444 (1984).
15. Mitterer, R. M. & Kriausakul, N. Org. Geochem. 7, 91-98 (1984).
16. Williams, K. M. & Smith, G. G. Origins Life 8, 91-144 (1977).
17. Engel, M. H. & Hare, P. E. Yb. Carnegie Instn Wash. 81, 425-430 (1982).
18. Hare, P. E. Yb. Carnegie Instn Wash. 73, 576-581 (1974).
19. Pillinger, C. T. Nature 296, 802 (1982).
20. Neuberger, A. Adv. Protein Chem. 4, 298-383 (1948).
21. Engel, M. H. & Macko, S. A. Analyt. Chem. 56, 2598-2600 (1984).
22. Dungworth, G. Chem. Geol. 17, 135-153 (1976).
23. Weinstein, S., Engel, M. H. & Hare, P. E. Analyt. Biochem. 121, 370-377 (1982).
24. Macko, S. A., Lee, W. Y. & Parker, P. L. J. exp. mar. Biol. Ecol. 63, 145-149 (1982).
25. Macko, S. A., Estep, M. L. F. & Haering, T. C. Yb. Carnegie Instn Wash. 81, 413-417 (1982).
26. Vallentyne, J. R. Geochim. cosmochim. Acta 28, 157-188 (1964).
27. Engel, M. H. & Nagy, B. Nature 296, 837-840 (1982).
Learning representations by back-propagating errors
David E. Rumelhart*, Geoffrey E. Hinton† & Ronald J. Williams*
* Institute for Cognitive Science, C-015, University of California, San Diego, La Jolla, California 92093, USA
† Department of Computer Science, Carnegie-Mellon University, Pittsburgh, Philadelphia 15213, USA
We describe a new learning procedure, back-propagation, for networks of neurone-like units. The procedure repeatedly adjusts the weights of the connections in the network so as to minimize a measure of the difference between the actual output vector of the net and the desired output vector. As a result of the weight adjustments, internal 'hidden' units which are not part of the input or output come to represent important features of the task domain, and the regularities in the task are captured by the interactions of these units. The ability to create useful new features distinguishes back-propagation from earlier, simpler methods such as the perceptron-convergence procedure¹.
† To whom correspondence should be addressed.

There have been many attempts to design self-organizing neural networks. The aim is to find a powerful synaptic modification rule that will allow an arbitrarily connected neural network to develop an internal structure that is appropriate for a particular task domain. The task is specified by giving the desired state vector of the output units for each state vector of the input units. If the input units are directly connected to the output units it is relatively easy to find learning rules that iteratively adjust the relative strengths of the connections so as to progressively reduce the difference between the actual and desired output vectors². Learning becomes more interesting but
more difficult when we introduce hidden units whose actual or desired states are not specified by the task. (In perceptrons, there are 'feature analysers' between the input and output that are not true hidden units because their input connections are fixed by hand, so their states are completely determined by the input vector: they do not learn representations.) The learning procedure must decide under what circumstances the hidden units should be active in order to help achieve the desired input-output behaviour. This amounts to deciding what these units should represent. We demonstrate that a general purpose and relatively simple procedure is powerful enough to construct appropriate internal representations.
The simplest form of the learning procedure is for layered networks which have a layer of input units at the bottom; any number of intermediate layers; and a layer of output units at the top. Connections within a layer or from higher to lower layers are forbidden, but connections can skip intermediate layers. An input vector is presented to the network by setting the states of the input units. Then the states of the units in each layer are determined by applying equations (1) and (2) to the connections coming from lower layers. All units within a layer have their states set in parallel, but different layers have their states set sequentially, starting at the bottom and working upwards until the states of the output units are determined.
The total input, x_j, to unit j is a linear function of the outputs, y_i, of the units that are connected to j and of the weights, w_ji, on these connections:

x_j = Σ_i y_i w_ji    (1)

Units can be given biases by introducing an extra input to each unit which always has a value of 1. The weight on this extra input is called the bias and is equivalent to a threshold of the opposite sign. It can be treated just like the other weights.
A unit has a real-valued output, y_j, which is a non-linear function of its total input:

y_j = 1 / (1 + e^(−x_j))    (2)
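Equations (1) and (2) define the forward pass. The sketch below implements them for a plain layer-to-layer net (the paper also allows connections that skip layers, which this toy version omits); weights[k] maps layer k to layer k+1, and all names are illustrative.

```python
import numpy as np

def forward(weights, biases, x):
    """Forward pass for layered logistic units, eqs. (1)-(2).
    weights[k] has shape (units in layer k, units in layer k+1)."""
    states = [np.asarray(x, dtype=float)]
    for W, b in zip(weights, biases):
        total_input = states[-1] @ W + b                   # eq. (1): x_j = sum_i y_i w_ji (+ bias)
        states.append(1.0 / (1.0 + np.exp(-total_input)))  # eq. (2): logistic output
    return states                                          # activities of every layer, input first
```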
[Figure 1 diagram: the symmetry-detection network, with input units at the bottom, two hidden units, and one output unit; the weights on the arcs (values such as ±14.2, ±7.2, ±7.1, ±3.6) and the biases inside the nodes are described in the caption below.]
Fig. 1 A network that has learned to detect mirror symmetry in the input vector. The numbers on the arcs are weights and the numbers inside the nodes are biases. The learning required 1,425 sweeps through the set of 64 possible input vectors, with the weights being adjusted on the basis of the accumulated gradient after each sweep. The values of the parameters in equation (9) were ε = 0.1 and α = 0.9. The initial weights were random and were uniformly distributed between −0.3 and 0.3. The key property of this solution is that for a given hidden unit, weights that are symmetric about the middle of the input vector are equal in magnitude and opposite in sign. So if a symmetrical pattern is presented, both hidden units will receive a net input of 0 from the input units, and, because the hidden units have a negative bias, both will be off. In this case the output unit, having a positive bias, will be on. Note that the weights on each side of the midpoint are in the ratio 1:2:4. This ensures that each of the eight patterns that can occur above the midpoint sends a unique activation sum to each hidden unit, so the only pattern below the midpoint that can exactly balance this sum is the symmetrical one. For all non-symmetrical patterns, both hidden units will receive non-zero activations from the input units. The two hidden units have identical patterns of weights but with opposite signs, so for every non-symmetric pattern one hidden unit will come on and suppress the output unit.
It is not necessary to use exactly the functions given in equations (1) and (2). Any input-output function which has a bounded derivative will do. However, the use of a linear function for combining the inputs to a unit before applying the nonlinearity greatly simplifies the learning procedure.
The aim is to find a set of weights that ensure that for each input vector the output vector produced by the network is the same as (or sufficiently close to) the desired output vector. If there is a fixed, finite set of input-output cases, the total error in the performance of the network with a particular set of weights can be computed by comparing the actual and desired output vectors for every case. The total error, E, is defined as

E = ½ Σ_c Σ_j (y_{j,c} − d_{j,c})²    (3)

where c is an index over cases (input-output pairs), j is an index over output units, y is the actual state of an output unit and d is its desired state. To minimize E by gradient descent it is necessary to compute the partial derivative of E with respect to each weight in the network. This is simply the sum of the partial derivatives for each of the input-output cases. For a given case, the partial derivatives of the error with respect to each weight are computed in two passes. We have already described the forward pass in which the units in each layer have their states determined by the input they receive from units in lower layers using equations (1) and (2). The backward pass, which propagates derivatives from the top layer back to the bottom one, is more complicated.
[Figure 2 diagram: two isomorphic family trees, one English (Christopher = Penelope, Andrew = Christine, Margaret = Arthur, Victoria = James, Jennifer = Charles, Colin, Charlotte) and one Italian (Roberto = Maria, Pierro = Francesca, Gina = Emilio, Lucia = Marco, Alfonso, Sophia, Angela = Tomaso).]
Fig. 2 Two isomorphic family trees. The information can be expressed as a set of triples of the form (person 1)(relationship)(person 2), where the possible relationships are {father, mother, husband, wife, son, daughter, uncle, aunt, brother, sister, nephew, niece}. A layered net can be said to 'know' these triples if it can produce the third term of each triple when given the first two. The first two terms are encoded by activating two of the input units, and the network must then complete the proposition by activating the output unit that represents the third term.

Fig. 3 Activity levels in a five-layer network after it has learned. The bottom layer has 24 input units on the left for representing (person 1) and 12 input units on the right for representing the relationship. The white squares inside these two groups show the activity levels of the units. There is one active unit in the first group representing Colin and one in the second group representing the relationship 'has-aunt'. Each of the two input groups is totally connected to its own group of 6 units in the second layer. These groups learn to encode people and relationships as distributed patterns of activity. The second layer is totally connected to the central layer of 12 units, and these are connected to the penultimate layer of 6 units. The activity in the penultimate layer must activate the correct output units, each of which stands for a particular (person 2). In this case, there are two correct answers (marked by black dots) because Colin has two aunts. Both the input units and the output units are laid out spatially with the English people in one row and the isomorphic Italians immediately below.
The backward pass starts by computing ∂E/∂y for each of the output units. Differentiating equation (3) for a particular case, c, and suppressing the index c gives

∂E/∂y_j = y_j − d_j    (4)

We can then apply the chain rule to compute ∂E/∂x_j:

∂E/∂x_j = ∂E/∂y_j · dy_j/dx_j

Differentiating equation (2) to get the value of dy_j/dx_j and substituting gives

∂E/∂x_j = ∂E/∂y_j · y_j (1 − y_j)    (5)

This means that we know how a change in the total input x to an output unit will affect the error. But this total input is just a linear function of the states of the lower level units and it is also a linear function of the weights on the connections, so it is easy to compute how the error will be affected by changing these states and weights. For a weight w_ji, from i to j, the derivative is

∂E/∂w_ji = ∂E/∂x_j · ∂x_j/∂w_ji
         = ∂E/∂x_j · y_i    (6)

and for the output of the i-th unit the contribution to ∂E/∂y_i
resulting from the effect of i on j is simply

∂E/∂x_j · w_ji

so taking into account all the connections emanating from unit i we have

∂E/∂y_i = Σ_j ∂E/∂x_j · w_ji    (7)
Fig. 4 The weights from the 24 input units that represent people to the 6 units in the second layer that learn distributed representations of people. White rectangles, excitatory weights; black rectangles, inhibitory weights; the area of each rectangle encodes the magnitude of the weight. The weights from the 12 English people are in the top row of each unit. Unit 1 is primarily concerned with the distinction between English and Italian and most of the other units ignore this distinction. This means that the representation of an English person is very similar to the representation of their Italian equivalent. The network is making use of the isomorphism between the two family trees to allow it to share structure and it will therefore tend to generalize sensibly from one tree to the other. Unit 2 encodes which generation a person belongs to, and unit 6 encodes which branch of the family they come from. The features captured by the hidden units are not at all explicit in the input and output encodings, since these use a separate unit for each person. Because the hidden features capture the underlying structure of the task domain, the network generalizes correctly to the four triples on which it was not trained. We trained the network for 1500 sweeps, using ε = 0.005 and α = 0.5 for the first 20 sweeps and ε = 0.01 and α = 0.9 for the remaining sweeps. To make it easier to interpret the weights we introduced 'weight-decay' by decrementing every weight by 0.2% after each weight change. After prolonged learning, the decay was balanced by ∂E/∂w, so the final magnitude of each weight indicates its usefulness in reducing the error. To prevent the network needing large weights to drive the outputs to 1 or 0, the error was considered to be zero if output units that should be on had activities above 0.8 and output units that should be off had activities below 0.2.
[Figure 5 annotation: a set of corresponding weights.]
Fig. 5 A synchronous iterative net that is run for three iterations and the equivalent layered net. Each time-step in the recurrent net corresponds to a layer in the layered net. The learning procedure for layered nets can be mapped into a learning procedure for iterative nets. Two complications arise in performing this mapping: first, in a layered net the output levels of the units in the intermediate layers during the forward pass are required for performing the backward pass (see equations (5) and (6)), so in an iterative net it is necessary to store the history of output states of each unit. Second, for a layered net to be equivalent to an iterative net, corresponding weights between different layers must have the same value. To preserve this property, we average ∂E/∂w for all the weights in each set of corresponding weights and then change each weight in the set by an amount proportional to this average gradient. With these two provisos, the learning procedure can be applied directly to iterative nets. These nets can then either learn to perform iterative searches or learn sequential structures⁴.
We have now seen how to compute ∂E/∂y for any unit in the penultimate layer when given ∂E/∂y for all units in the last layer. We can therefore repeat this procedure to compute this term for successively earlier layers, computing ∂E/∂w for the weights as we go.
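The backward pass of equations (4)-(7), written to match the forward-pass sketch above: starting from the output error it propagates ∂E/∂x and ∂E/∂y down the layers and accumulates ∂E/∂w. Bias gradients are omitted for brevity (the paper treats a bias as a weight on a constant input of 1); this is an illustrative reconstruction, not the authors' code.

```python
import numpy as np

def backward(weights, states, target):
    """Backward pass, eqs. (4)-(7): returns dE/dw for each weight matrix,
    given the layer activities from the forward pass and the desired outputs."""
    grads = [None] * len(weights)
    dE_dy = states[-1] - target                          # eq. (4)
    for k in reversed(range(len(weights))):
        y = states[k + 1]
        dE_dx = dE_dy * y * (1.0 - y)                    # eq. (5): logistic derivative
        grads[k] = np.outer(states[k], dE_dx)            # eq. (6): dE/dw_ji = dE/dx_j * y_i
        dE_dy = weights[k] @ dE_dx                       # eq. (7): sum over connections from unit i
    return grads
```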
One way of using ∂E/∂w is to change the weights after every input-output case. This has the advantage that no separate memory is required for the derivatives. An alternative scheme, which we used in the research reported here, is to accumulate ∂E/∂w over all the input-output cases before changing the weights. The simplest version of gradient descent is to change each weight by an amount proportional to the accumulated ∂E/∂w:

Δw = −ε ∂E/∂w    (8)
This method does not converge as rapidly as methods which make use of the second derivatives, but it is much simpler and can easily be implemented by local computations in parallel hardware. It can be significantly improved, without sacrificing the simplicity and locality, by using an acceleration method in which the current gradient is used to modify the velocity of the point in weight space instead of its position:

Δw(t) = −ε ∂E/∂w(t) + α Δw(t − 1)    (9)

where t is incremented by 1 for each sweep through the whole set of input-output cases, and α is an exponential decay factor between 0 and 1 that determines the relative contribution of the current gradient and earlier gradients to the weight change.
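The accumulated-gradient update of equations (8) and (9) then amounts to keeping one velocity array per weight array; a sketch in which weights, grads, and velocity are lists of equally shaped NumPy float arrays, and eps and alpha play the roles of ε and α:

```python
def update(weights, grads, velocity, eps=0.1, alpha=0.9):
    """Momentum update, eqs. (8)-(9):
    delta_w(t) = -eps * dE/dw(t) + alpha * delta_w(t-1), applied in place."""
    for W, g, v in zip(weights, grads, velocity):
        v *= alpha        # alpha * delta_w(t-1)
        v -= eps * g      # - eps * dE/dw(t)
        W += v            # apply the weight change
```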
To break symmetry we start with small random weights. Variants on the learning procedure have been discovered independently by David Parker (personal communication) and by Yann Le Cun3.
One simple task that cannot be done by just connecting the input units to the output units is the detection of symmetry. To detect whether the binary activity levels of a one-dimensional array of input units are symmetrical about the centre point, it is essential to use an intermediate layer because the activity in an individual input unit, considered alone, provides no evidence about the symmetry or non-symmetry of the whole input vector, so simply adding up the evidence from the individual input units is insufficient. (A more formal proof that intermediate units are required is given in ref. 2.) The learning procedure discovered an elegant solution using just two intermediate units, as shown in Fig. 1.
Another interesting task is to store the information in the two family trees (Fig. 2). Figure 3 shows the network we used, and Fig. 4 shows the 'receptive fields' of some of the hidden units after the network was trained on 100 of the 104 possible triples.
So far, we have only dealt with layered, feed-forward networks. The equivalence between layered networks and recurrent networks that are run iteratively is shown in Fig. 5.
The most obvious drawback of the learning procedure is that the error-surface may contain local minima so that gradient descent is not guaranteed to find a global minimum. However, experience with many tasks shows that the network very rarely gets stuck in poor local minima that are significantly worse than the global minimum. We have only encountered this undesirable behaviour in networks that have just enough connections to perform the task. Adding a few more connections creates extra dimensions in weight-space and these dimensions provide paths around the barriers that create poor local minima in the lower dimensional subspaces.
The learning procedure, in its current form, is not a plausible model of learning in brains. However, applying the procedure to various tasks shows that interesting internal representations can be constructed by gradient descent in weight-space, and this suggests that it is worth looking for more biologically plausible ways of doing gradient descent in neural networks.
We thank the System Development Foundation and the Office of Naval Research for financial support.
Received 1 May; accepted 31 July 1986.

1. Rosenblatt, F. Principles of Neurodynamics (Spartan, Washington, DC, 1961).
2. Minsky, M. L. & Papert, S. Perceptrons (MIT, Cambridge, 1969).
3. Le Cun, Y. Proc. Cognitiva 85, 599-604 (1985).
4. Rumelhart, D. E., Hinton, G. E. & Williams, R. J. in Parallel Distributed Processing: Explorations in the Microstructure of Cognition. Vol. 1: Foundations (eds Rumelhart, D. E. & McClelland, J. L.) 318-362 (MIT, Cambridge, 1986).
[Figure 1: visual acuity plots for kittens C155 (upper panel) and C164 (lower panel) against days since termination of reverse occlusion; caption below.]
Bilateral amblyopia after a short period of reverse occlusion in kittens
Kathryn M. Murphy* & Donald E. Mitchell
Department of Psychology, Dalhousie University, Halifax, Nova Scotia, Canada B3H 4J1
The majority of neurones in the visual cortex of both adult cats and kittens can be excited by visual stimulation of either eye. Nevertheless, if one eye is deprived of patterned vision early in life, most cortical cells can only be activated by visual stimuli presented to the nondeprived eye and behaviourally the deprived eye is apparently useless¹,². Although the consequences of monocular deprivation can be severe, they can in many circumstances be rapidly reversed with the early implementation of reverse occlusion, which forces the use of the initially deprived eye³,⁴. However, by itself reverse occlusion does not restore a normal distribution of cortical ocular dominance³ and only promotes visual recovery in one eye⁵,⁶. In an effort to find a procedure that might restore good binocular vision, we have examined the effects on acuity and cortical ocular dominance of a short, but physiologically optimal, period of reverse occlusion, followed by a period of binocular vision beginning at 7.5 weeks of age. Surprisingly, despite the early introduction of binocular vision, both eyes attained acuities that were only approximately 1/3 of normal acuity levels. Despite the severe bilateral amblyopia, cortical ocular dominance appeared similar to that of normal cats. This is the first demonstration of severe bilateral amblyopia following consecutive periods of monocular occlusion.
* Present address: School of Optometry, University of California, Berkeley, California 94720, USA.

Fig. 1. Changes in visual acuity during the period of binocular vision for two kittens (C155 and C164) that were previously monocularly deprived until 5 weeks of age, and then reverse occluded for 18 days. Acuity (cycles per degree) is plotted against days since termination of reverse occlusion. Filled circles, acuity of the initially deprived eye; open circles, acuity of the initially nondeprived eye.

Nine kittens were used, of which eight were monocularly deprived by eyelid suture from about the time of natural eye opening (6 to 11 days) until 5 weeks of age, at which time the initially deprived eye was opened and the other eye was sutured closed for 18 days. Physiological recordings from area 17 were made from one normal control and from five monocularly deprived kittens, one immediately after reverse occlusion (as a control), the remaining four after a further 4 weeks at least (range 4-8 weeks) of normal binocular vision. Grating acuity thresholds were determined for both eyes of a further three kittens (subjected to the same regime: monocular deprivation, 18 days reverse suturing, followed by normal binocular vision) by use of a jumping stand⁵⁻⁷. None of the kittens tested behaviourally were examined physiologically. Single unit recordings were made in area 17 of the anaesthetized, paralysed kittens (one normal, five experimental) with glass-coated platinum-iridium electrodes. Anaesthesia was induced by
intravenous pentothal and maintained by artificial respiration with 70% N₂O and 30% O₂ supplemented with intravenous Nembutal; EEG, EKG, body temperature, and expired CO₂ levels were monitored. The eyes were brought to focus on a tangent screen 137 cm distant from the kitten using contact lenses with 3 mm artificial pupils. Single units were recorded along one long penetration in area 17 down the medial bank of the postlateral gyrus in each hemisphere, always beginning in the hemisphere contralateral to the initially open eye. Receptive fields were sampled according to established procedures⁸, every 100 µm along the penetration in a cortical region corresponding to the horizontal meridian of visual space. All units were located within 15° of the area centralis, with the majority within 5°.
The longitudinal changes in visual acuity of both eyes following introduction of binocular vision are shown in Fig. 1 for two representative kittens. At the end of 18 days of reverse occlusion the vision of the initially deprived eye had recovered to only rudimentary levels (1-2.5 cycles per degree) while at the same time the initially nondeprived eye had been rendered blind. During the subsequent period of binocular visual exposure the vision of both eyes improved slightly, but only to a very limited extent (to between 1.7 and 3.4 cycles per degree). The results from the third animal were very similar. After more than 2 months of binocular exposure the acuities of the initially deprived and nondeprived eyes were, respectively, 2.54 and 3.35 cycles per degree. Surprisingly, after 2 months of binocular vision, the acuity of both eyes of these animals remained at about one-third to one-half of normal levels⁶. Although the initially deprived eye was opened at the peak of the sensitive period (5 weeks of age) and the initially nondeprived eye was closed for a relatively brief period of time (18 days), this deprivation regimen had a devastating and permanent effect upon the visual acuity of both eyes.

View File

@@ -0,0 +1,13 @@
Title: 6088 - V323.indd
Creator: Adobe InDesign 2.0
Producer: Acrobat Distiller 11.0.9(Windows)
CreationDate: 05/31/17 12:23:50
ModDate: 05/31/17 12:23:50
Tagged: yes
Form: none
Pages: 4
Encrypted: no
Page size: 581.102 x 793.701 pts (rotated 0 degrees)
File size: 1376827 bytes
Optimized: yes
PDF version: 1.4

View File

@@ -0,0 +1,407 @@
Computer interface evaluation using eye movements: Methods and constructs
Article in International Journal of Industrial Ergonomics · October 1999
DOI: 10.1016/S0169-8141(98)00068-7
International Journal of Industrial Ergonomics 24 (1999) 631-645

Computer interface evaluation using eye movements: methods and constructs

Joseph H. Goldberg*, Xerxes P. Kotval¹

The Pennsylvania State University, Department of Industrial and Manufacturing Engineering, 207 Hammond Building, University Park, PA 16802-1401, USA

Received 1 March 1998; received in revised form 13 March 1998; accepted 7 July 1998

Abstract

Eye movement-based analysis can enhance traditional performance, protocol, and walk-through evaluations of computer interfaces. Despite a substantial history of eye movement data collection in tasks, there is still a great need for an organized definition and evaluation of appropriate measures. Several measures based upon eye movement locations and scanpaths were evaluated here, to assess their validity for assessment of interface quality. Good and poor interfaces for a drawing tool selection program were developed by manipulating the grouping of tool icons. These were subsequently evaluated by a collection of 50 interface designers and typical users. Twelve subjects used the interfaces while their eye movements were collected. Compared with a randomly organized set of component buttons, well-organized functional grouping resulted in shorter scanpaths, covering smaller areas. The poorer interface resulted in more, but similar duration, fixations than the better interface. Whereas the poor interface produced less efficient search behavior, the layout of component representations did not influence their interpretability. Overall, data obtained from eye movements can significantly enhance the observation of users' strategies while using computer interfaces, which can subsequently improve the precision of computer interface evaluations.

Relevance to industry

The software development industry requires improved methods for the objective analysis and design of software interfaces. This study provides a foundation for using eye movement analysis as part of an objective evaluation tool for many phases of interface analysis. The present approach is instructional in its definition of eye movement-based measures, and is evaluative with respect to the utility of these measures. 1998 Elsevier Science B.V. All rights reserved.

Keywords: Eye movements; HCI; Computer interface design; Software evaluation; Fixation algorithms
1. Introduction
* Corresponding author. Present address: Lucent Technologies, Bell Laboratories, Holmdel, NY.
1.1. Interface evaluation
The software development cycle requires frequent iterations of user testing and interface
modi"cation. These interface evaluations, whether at initial design or at later test and evaluation stages, should assess system functionality and the impact of the interface on the user. Earlier, design evaluation methods include cognitive walkthroughs, heuristic, review-based, and modelbased evaluations. At more mature product phases, performance-based experiments, protocol/observation, and questionnaires are frequently used as a basis for evaluation (Dix et al., 1998). Performance-based studies assess errors and time to complete speci"ed operations or scenarios (Wickens et al., 1998).
Interface evaluation and usability testing are expensive, time-intensive exercises, often done with poorly documented standards and objectives. They are frequently qualitative, with poor reliability and sensitivity. Provision of an improved tool for rapid and e!ective evaluation of graphical user interfaces was the motivating goal underlying the present work assessing eye movements as an indicator of interface usability.
1.2. Eye movements on displays

While using a computer interface, one's eye movements usually indicate one's spatial focus of attention on a display. In order to foveate informative areas in a scene, the eyes naturally fixate upon areas that are surprising, salient, or important through experience (Loftus and Mackworth, 1978). Thus, current gazepoints on a display can approximate foci of attention over a time period. When considering short time intervals, however, one's attentional focus may lead or lag the gazepoint (Just and Carpenter, 1976). By choosing long enough sampling intervals for eye movements, temporal leads/lags should be averaged out.

Applied eye movement analysis has at least a 60 yr history in performance and usability assessments of spatial displays within information acquisition contexts such as aviation, driving, X-ray search, and advertising. Buswell (1935) measured fixation densities and serial scanpaths while individuals freely viewed artwork samples, noting that eyes follow the direction of principal lines in figures, and that more difficult processing produced longer fixation durations. Mackworth (1976) noted that higher display densities produced 50-100 ms longer fixation durations than lower density displays. Non-productive eye movements more than 20° from the horizontal scanning axis strongly increased as a percentage of all eye movements as the display width and density increased. Kolers et al. (1981) measured eye fixations (number, number per line, rate, duration, words per fixation) as a function of character and line spacing in a reading task. More fixations per line (and fewer fixations per word) were associated with more tightly-grouped, single-spaced material. Fewer, yet longer fixations were made with smaller, more densely packed text characters. Yamamoto and Kuto (1992) found improved Japanese character reading performance associated with series of sequential rather than backtracking eye movements. Eye tracking has aided the assessment of whether the order of product versus filler displays in a television commercial influences one's attention to that product (Janiszewski and Warlop, 1993). Using eye movement analyses while scanning advertisements on telephone yellow pages, quarter-page ad displays were much more noticed than text listings, and color ads were perceived more quickly, more often, and longer than black and white ads (Lohse, 1997).

Prior eye movement-based interface and usability characterizations have relied heavily upon cumulative fixation time and areas-of-interest approaches, dividing an interface into predefined areas. Transitions into and from these areas, as well as time spent in each area, are tallied. While these approaches can signal areas where more or less attention is spent while using a display, few investigations have considered the complex nature of scanpaths, defined from a series of fixations and saccades on the interface. Scanpath complexity and regularity measures are needed to approach some of the subtler interface usability issues in screen design.
1.3. Objective

Eye tracking systems are now inexpensive, reliable, and precise enough to significantly enhance system evaluations. While the hardware technology is quite mature (Young and Sheena, 1975, for a general review), methods of evaluating data from eye
tracking experiments are still somewhat immature and disorganized. The objective of the present paper is to provide an introduction and framework for eye movement data analysis techniques. These eye movement measures and algorithms are presented in light of results from an experiment presenting users with both 'good' and 'poor' interfaces.

2. Methods

Scanpaths were collected from 12 subjects while using both 'good' and 'poor' software interfaces. The resulting scanpaths were characterized using a number of quantitative measures, each designed to characterize different aspects of scanpath behaviors and relate to the cognitive behavior underlying visual search and information processing. A comparison of expected user search behavior using each interface with the results of scanpath measures was used to determine the relative effectiveness of each measure.

2.1. Interface stimuli

The good-poor distinction was based upon physical grouping of interface tool buttons. Users expect physically grouped components to be related by some common characteristic, whether physical or conceptual (Wickens and Carswell, 1995). Exploiting this, the 'good' interface grouped eleven components into three functionally related groups: editing, drawing, and text manipulation tools (Fig. 1, left panel). These functional groupings were intended to allow relatively efficient tool search, compared with the poorly designed interface (Fig. 1, right panel) intended to cause less efficient visual search. The 'poor' interface provided a randomized (i.e., not functional or conceptual) relationship within each tool group.

To verify a substantial difference in perceived quality, fifty typical users and thirty interface design experts rated each interface on a scale from 1 (excellent) to 5 (unacceptable). The functionally grouped interface averaged 1.35, between good and excellent, whereas the randomly grouped interface averaged 4.53, between unacceptable and poor. Thus, the two interfaces were confirmed as substantially different in design quality.

Example 'good' and 'poor' interfaces were programmed to provide a well-controlled and equally familiar environment for all subjects in this study. Their primary purpose was not to evaluate the usability of these particular interfaces per se; rather, they provided a means for validating the various created measures. Fig. 1 shows two of these interfaces. The interface showed a work area with a panel of tool buttons, much like a drawing package.
2.2. Apparatus and calibration

The experiment was hosted on a PC with a 13 in (33 cm) VGA monitor with mouse/windows control. A second computer, remotely activated by the host computer, controlled the eye tracking system, a DBA Systems Model 626 infrared corneal reflection system (Fig. 2). An infrared-sensitive CCD video camera was positioned just below the host

Fig. 1. Interface designs. Left panel: good design; right panel: poor design.
Fig. 2. Experimental apparatus, showing eye tracker with infrared-sensitive camera lens.
computer's monitor. The camera contained an LED inline with its focal axis, generating an illuminated pupil and light glint ("rst Purkinje re#ection) on the subject's cornea. The head posture and eye location were maintained with a head/chin rest, such that the eye was 22 in (56 cm) from the screen, and level with its center. At this distance, the screen subtended 213 and 163 of horizontal and vertical visual angle, respectively. Each 65;65 pixel tool button in the interface subtended 1 in, or 2.23 of horizontal visual angle.
Video images of the pupil and Purkinje re#ection were captured at 60 Hz by the eye tracker and assigned light intensity values to each pixel in the digital image. An intensity threshold "ltered the video image until the pupil image was isolated. Eye tracker software located the center of the pupil and calculated the vector from it to the corneal light glint. A calibration procedure related this vector with Cartesian coordinates on the interface screen, providing the subject's eyegaze location, or pointof-regard (POR). The POR coordinates were collected and stored in a data"le for later processing.
Calibration used a set of 9 screen locations, and was checked with each block (33 trials) in each subject's session. The criterion for a successful calibration equated to residuals that were less than 0.5 cm (0.5° visual angle) from the actual target location. In other words, the eye tracker software
estimate of target location was not more than 10 pixels away from the actual target location.
2.3. Subjects
Twelve subjects (7 female, 5 male) participated in this study. Ages ranged from 20 to 27 yr (mean 23 yr). Participants averaged 4.8 yr of experience using typical windowing software, spending an average of 15.3 h a week using software interfaces. Because corrective lenses produce additional surface reflections which interfere with the eye tracker's identification and processing of the Purkinje image, subjects performed the experiment without corrective lenses. All subjects had an uncorrected Snellen visual acuity of 20/35 or better, as determined by a Bausch and Lomb Vision Tester (Cat. 71-22-41).
2.4. Procedure and design
After adjusting the chinrest and workstation, each subject was carefully calibrated. Calibration was also repeated prior to each block. Practice, consisting of a block of 33 trials, was provided for each of the tested interfaces. Each trial in a block was initiated by the subject selecting a "Continue" button at the center of the work area with the mouse. The "Continue" button was then
Table 1. Classification of eye movement and scanpath measures
immediately replaced by the name of one of the eleven randomized tool buttons (e.g., CUT) in the middle of the workspace. The eye tracker initiated its POR data collection at this time. The subject, as quickly as possible, then located the tool button from the tool menu at the left of the display, and clicked the left button on the mouse, stopping POR collection. Feedback, consisting of a statement of "correct" or "incorrect" at the position of the initial instruction location, was provided after each trial. A 1 min break was provided between each block; the total subject testing time was 40 min.
Within each of the 12 subjects, the experiment presented 6 replicates of each of the 11 tool button components for each of the two interfaces presented here. The trial order was counterbalanced between subjects. A fully-crossed ANOVA for each dependent measure included Subjects (12 levels, random effect) × Interface (2 levels, fixed effect) × Tool Component (11 levels, fixed effect) × 6 replicates.
3. Scanpath generation
3.1. Classification of measures
Scanpaths are defined by a saccade-fixate-saccade sequence on a display. For information search tasks, the optimal scanpath is a straight line to
a desired target, with relatively short fixation duration at the target. The derived scanpath measures discussed below attempt to quantitatively measure the divergence from this optimal scanpath in several ways. The measures each provide a single quantitative value, with some requiring no knowledge of the content of the computer interface. Table 1 provides a summary of these measures, categorizing them on two dimensions. Temporal measures describe the sequential, time-based nature of a scanpath, whereas spatial measures emphasize the spread and coverage of a scanpath. Furthermore, the measures may rely upon unprocessed, 60 Hz raw gazepoint samples, or may be more oriented to processed fixations and/or saccades within a scanpath. Typically, reported eye movement data have been pre-processed to form fixations and saccades, by one of many different algorithms (Goldberg and Schryver, 1993). The resulting set of fixations and saccades is further processed to characterize scanpaths and their dynamic change (Goldberg and Schryver, 1995). However, some of the measures presented here can be applied to the gazepoint samples, which is computationally easier, but carries less behavioral meaning.
3.2. Fixations
The eyes dart from fixation to fixation in a typical search on a display. At least three processes take
Fig. 3. Events occurring within typical fixations.
place within the 250–300 ms of a typical fixation (Viviani, 1990), as shown in Fig. 3. First, visual information is encoded, presumably to label the general scene (Loftus and Mackworth, 1978). Next, the peripheral visual field of the current gaze is sampled, to determine subsequent informative areas. Finally, the next saccade is planned and prepared. These processes overlap, and may occur in parallel.
Gazepoints sampled at 60 Hz represent the line-of-sight at the time of sampling, and may or may not be at a location within a fixation, as sampling may have occurred during a saccade or perhaps during a blink or other artifact. Most commercial eye tracking systems include software removal of these artifacts, and some also include fixation construction algorithms. Fixation algorithms may be based on cluster and other statistical analyses, and may be locally adaptive to the amplitude of ocular jumps (Goldberg and Schryver, 1995; Ramakrishna et al., 1993; Belofsky and Lyon, 1988; Scinto and Barnette, 1986). Most algorithms develop fixation clusters by using a constrained spatial proximity determination, but temporal constraints can also be used. Latimer (1988) used temporal information related to each sample gazepoint, but only to determine the cumulative fixation time after the cluster had been defined by spatial criteria. A fixation algorithm must produce fixations that meet certain minimum characteristics. The center of a typical fixation is within 2–3° of the observed target object (Robinson, 1979), and the minimum processing duration during a fixation is 100–150 ms (Viviani, 1990).
The present study used a data position variance method (Anliker, 1976), after removing blinks and other eye movement artifacts. Fixations were initially constrained to a 3°±0.5° spatial area, and had to be at least 100 ms in duration. This corresponded to a minimum of 6 sample gazepoints per fixation (at 60 Hz), following Karsh and Breitenbach (1983), and agrees with descriptions of saccades lasting 20–100 ms (Hallet, 1986). Once a fixation was initially defined, its spatial diameter was computed. Subsequent gazepoint data samples falling within this diameter threshold were iteratively added to the fixation. The spatial diameter threshold was then raised or lowered within a subject, following the method of Krose and Burbeck (1989), with only one fixation diameter allowed per subject. Maximum fixation diameters were varied from 2° to 4°, in 0.5° increments, until the defined fixations sufficiently fit the gazepoint data. Allowed fixation diameters were increased when too few fixations (of very short duration) were evident. Conversely, fixation durations longer than 900 ms indicated that the fixation diameter should be decreased.
While these methods are useful in identifying critical areas of attentional focus on a display, information based on the temporal order of fixations is lost. When and how often a target is fixated during a scanpath provides valuable information for the evaluation of an interface. Fig. 4 illustrates
Fig. 4. Comparison of fixation algorithms. Spatial constraints (left panel); spatial plus temporal constraints (right panel).
Table 2 Algorithm for "xation clustering
Fixation cluster algorithm
Step 1: Place the first sample in the current cluster.
Step 2: Compute the common mean location of all samples in the current cluster, and take the next temporally sequential sample. If the new point is within 40 pixels of the common mean location, include the new point in the current cluster. If the new point is not within 40 pixels of the common mean, then the current cluster becomes an old cluster and the new point becomes the current cluster. If the number of points (n) in the old cluster is ≥ 6, then the cluster is classified as a FIXATION of n×16.67 ms duration. If n < 6, then the cluster is classified as a SACCADE of n×16.67 ms duration.
Step 3: GOTO Step 2 until done.
Fig. 5. Fixation cluster definition, showing 80 pixel diameter.
de"ned "xations that were within the 2}33 range described by Robinson (1979). If the total number of samples within a cluster was less than 6, then the cluster was categorized as part of a saccade. The general "xation algorithm applied to the present data is in Table 2.
the di!erence between spatially derived "xations and "xations derived on the basis of both spatial and temporal criteria. Areas A and B are areas of high interest due to the number of gazepoint samples at each location. The left panel shows a temporal independent clustering (spatial constraint only), whereas the right panel shows a temporalsensitive clustering. Areas A and B are still shown as areas of high interest, but by keeping track of the temporal order of samples, better information about the relationship between A and B are obtained.
The present study supplemented the preceding "xation method by testing sampled gazepoints in temporal order (Latimer, 1988; Tullis, 1983). Each of the 6 or more (de"ning at least 100 ms, at 60 Hz) temporally sequential gazepoint samples had to be within 40 pixels (0.6 in or 1.33) from the centroid of the gazepoint sample, as shown in Fig. 5. This
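For concreteness, the clustering rule of Table 2 can be sketched in a few lines of Python. This is a minimal illustration, not the original analysis software; it assumes the gaze data are already artifact-free (x, y) samples recorded at 60 Hz, and the function and constant names are invented for the example.

from dataclasses import dataclass
from math import hypot

SAMPLE_MS = 1000.0 / 60.0   # one gazepoint every 16.67 ms
RADIUS_PX = 40              # distance criterion from the running cluster mean
MIN_SAMPLES = 6             # >= 6 samples (>= 100 ms) -> FIXATION, else SACCADE

@dataclass
class Event:
    kind: str          # "FIXATION" or "SACCADE"
    x: float           # mean horizontal position of the cluster, in pixels
    y: float           # mean vertical position of the cluster, in pixels
    duration_ms: float

def cluster_fixations(samples):
    """Group temporally ordered (x, y) gazepoints into fixations and saccades."""
    if not samples:
        return []

    def close(cluster):
        # Summarize a finished cluster as a fixation or a saccade event.
        mx = sum(p[0] for p in cluster) / len(cluster)
        my = sum(p[1] for p in cluster) / len(cluster)
        kind = "FIXATION" if len(cluster) >= MIN_SAMPLES else "SACCADE"
        return Event(kind, mx, my, len(cluster) * SAMPLE_MS)

    events, cluster = [], [samples[0]]
    for x, y in samples[1:]:
        mx = sum(p[0] for p in cluster) / len(cluster)
        my = sum(p[1] for p in cluster) / len(cluster)
        if hypot(x - mx, y - my) <= RADIUS_PX:
            cluster.append((x, y))           # still within the current cluster
        else:
            events.append(close(cluster))    # close old cluster, start a new one
            cluster = [(x, y)]
    events.append(close(cluster))
    return events

Applied to a list of raw samples, the function returns the alternating fixation/saccade sequence on which the scanpath measures below operate.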
4. Measures of search
Illustrated descriptions of each of the eye movement measures and algorithms are provided below. Results from the good versus poor interfaces and other factors are also presented here. The same hypothetical scanpath is used in all examples below, for easy comparison. All of these measures may be used for a given scanpath, with each offering a slightly different interpretation of the data. Scanpaths may also be viewed as directed or undirected graphs, allowing additional characterizations of complexity and size from graph theory. The organized functional grouping of components in the good design was expected to induce subjects to find components quickly, producing rapid, direct search patterns. In contrast, the randomized groups of the poor design were intended to mislead subjects, causing them to stay in an incorrect grouping
or leave a correct grouping under the incorrect expectation that grouped components were related functionally. As a result, the poor design was expected to produce more extensive search behavior.
4.1. Scanpath length and duration
Scanpath length is a productivity measure that can be used for baseline comparisons or for defining an optimal visual search based on minimizing saccadic amplitudes. This may be computed independently of the actual screen layout, and may be applied to gazepoint samples or to processed fixation data. The length (in pixels) is the summation of the distances between the gazepoint samples. An example scanpath is illustrated in Fig. 6. Lengthy scanpaths indicate less efficient scanning behavior but do not distinguish between search and information processing times. Unless scanpaths are formed from computed fixations and saccades, the scanpaths should not be used to make detailed inferences about one's attentional allocation on a display.
Scanpath duration is more related to processing complexity than to visual search efficiency, as much more relative time is spent in fixations than in saccades. Using 60 Hz gazepoint samples, the number of samples is directly proportional to the temporal duration of each scanpath, or Scanpath Duration = n × 16.67 ms, where n = number of samples in the scanpath. However, using fixations, the scanpath duration must sum fixation durations with saccade durations.
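As a minimal sketch of these two measures, assuming the scanpath is a temporally ordered list of (x, y) gazepoint samples at 60 Hz (the function names are illustrative only):

from math import hypot

SAMPLE_MS = 1000.0 / 60.0   # duration represented by one 60 Hz sample

def scanpath_length_px(samples):
    """Sum of straight-line distances between successive gazepoint samples."""
    return sum(hypot(x2 - x1, y2 - y1)
               for (x1, y1), (x2, y2) in zip(samples, samples[1:]))

def scanpath_duration_ms(samples):
    """Each sample contributes 16.67 ms to the scanpath duration."""
    return len(samples) * SAMPLE_MS

The same length computation applies to fixation centers; the duration would then instead sum fixation and saccade durations.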
Using gazepoint samples, there was no significant difference in the overall duration of scanpaths produced by the good and poor interfaces (F = 1.90, p > 0.05). The average duration from the good interface was 1439 ms (sd = 368.4), while the poor interface produced average durations of 1543 ms (sd = 566.5). The non-significant duration difference here, possibly due to (non-significant) variance differences, should not be interpreted alone as a sign of similar interface quality. Further measure comparisons should be conducted.
Extensive search behavior produces spatially lengthy scanpaths. Two fixation-saccade scanpaths may have the same temporal duration but considerably different lengths, due to differences in the extent of search required. Using the summated
Fig. 6. Example computations for scanpath duration (left panel) and length (right panel).
lengths (by the Pythagorean theorem), scanpath lengths were computed, as in Fig. 6. The poor design did indeed produce significantly longer scanpaths (p < 0.05), averaging 228 pixels longer than the better interface (1978 pixels, sd = 491). Note that 65 pixels, here, was equivalent to a screen distance of 2.5 cm.
4.2. Convex hull area
Circumscribing the entire scanpath extends the length measures to consider the area covered by a scanpath. If a circle circumscribed the scanpath, small deviations in gazepoint samples would lead to dramatic changes in the area of the circumscribed circle, exaggerating actual differences in the scanpath area. As shown in Fig. 7, left panel, scanpaths A and B are similar, differing by only one excursion, but the area of the circle circumscribing scanpath B is 4 times the area of the circle circumscribing scanpath A. In contrast, scanpaths B and C are dramatically different in shape and range but produce the same circumscribed circle area.
Using the area of the convex hull circumscribing the scanpath, illustrated in Fig. 7, right panel, the exaggeration can be reduced. Note that scanpath areas A and B are now more similar, and B and C are less similar, than with circumscribed circles. Table 3 provides an algorithm to construct convex hulls and the hull area, of which Steps 1–4 are illustrated in Fig. 8. Fig. 9 provides a simple example of a scanpath convex hull area. Alternative algorithms
Fig. 7. Relative comparison of areas defined by circumscribed circles (left panel) and convex hulls (right panel).
for generating convex hulls are provided by Sedgewick (1990). Triangle areas (e.g., ABC) were computed from Heron's formula,

Area(ABC) = √(P(P − AB)(P − BC)(P − CA)),

where P is the semi-perimeter,

P = (AB + BC + CA)/2,

and each side length is

IJ = √((X_I − X_J)² + (Y_I − Y_J)²).

While the convex hull area may seem to be
a more comprehensive measure of search than the scanpath length, note that long scanpaths may still reside within a small spatial area. Used in conjunction, the two measures can determine if lengthy search covered a large or a localized area on a
Table 3. Algorithm for convex hull area
Step 1: Search all samples to identify and label the four samples with the Min x, Max y, Max x and Min y.
Step 2: Set Min x as Vertex(1).
Step 3: Compute the slope of Vertex(n) with every sample in the scanpath. IF {Min x < Vertex(n) < Max y} OR {Min y < Vertex(n) < Max x} THEN set Vertex(n) to the sample with the largest positive slope. IF {Max y < Vertex(n) < Max x} OR {Min y < Vertex(n) < Min x} THEN set Vertex(n) to the sample with the least negative slope. Store Vertex(n) in a list.
Step 4: IF Vertex(n) = Min x THEN GOTO Step 5; otherwise increment n and GOTO Step 3.
Step 5: Set n = 2.
Step 6: Compute and store the area of the triangle created by the points Vertex(1), Vertex(n) and Vertex(n+1). Increment n by 1. Repeat Step 6 until done. The sum of the stored areas equals the convex hull area.
display. In the present experiment, the poor design produced 11% larger search areas (31 339 pixels, sd = 14 952) than the better design (28 168 pixels, sd = 12 009; F = 6.70, p < 0.05). The larger search area, coupled with the longer scanpath produced by the poorer interface, indicated that the disorganized interface produced a widely distributed search pattern.
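A compact way to compute the hull area is sketched below. Table 3 wraps the hull by comparing slopes and sums triangle areas with Heron's formula; this sketch substitutes the equivalent, more common monotone-chain construction and the shoelace area formula, so it is an alternative implementation rather than a transcription of Table 3.

def convex_hull(points):
    """Return hull vertices in counter-clockwise order (Andrew's monotone chain)."""
    pts = sorted(set(points))
    if len(pts) <= 2:
        return pts

    def cross(o, a, b):
        # Positive when the turn o -> a -> b is counter-clockwise.
        return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])

    lower, upper = [], []
    for p in pts:
        while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0:
            lower.pop()
        lower.append(p)
    for p in reversed(pts):
        while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0:
            upper.pop()
        upper.append(p)
    return lower[:-1] + upper[:-1]

def hull_area_px2(points):
    """Area (square pixels) enclosed by the convex hull of the gazepoints."""
    hull = convex_hull(points)
    if len(hull) < 3:
        return 0.0
    area = 0.0
    for (x1, y1), (x2, y2) in zip(hull, hull[1:] + hull[:1]):
        area += x1 * y2 - x2 * y1        # shoelace formula
    return abs(area) / 2.0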
Fig. 8. Iterative example of convex hull generation algorithm.
Fig. 9. Example area generated by convex hull algorithm.
4.3. Spatial density
Coverage of an interface due to search and processing may be captured by the spatial distribution of gazepoint samples. Evenly spread samples throughout the display indicate extensive search with an inefficient path, whereas targeted samples in a small area reflect direct and efficient search. The interface can be divided into grid areas representing either specific objects or physical screen area. In the present experiment, the display was divided into an evenly spaced 10×10 grid, with each cell covering 64×48 pixels. The spatial density index was equal to the number of cells containing at least one sample, divided by the total number of grid cells (100); an example is shown in Fig. 10. A smaller spatial density indicated more directed search, regardless of the temporal gazepoint sampling order. The poor interface produced 7% larger spatial density indices (mean index = 10.2%, sd = 3.1%) compared with the better interface (mean index = 9.5%, sd = 2.3%, F = 6.31, p < 0.05).
4.4. Transition matrix
A transition matrix expresses the frequency of eye movement transitions between defined Areas of Interest (AOIs) (Ponsoda et al., 1995). This metric considers both search area and movement over time. While the scanpath spatial density provides useful information about the physical range of
Fig. 10. Example spatial density computation.
Fig. 11. Relative scanpath differences between efficient (A) and inefficient (B) search.
search, a transition matrix adds the temporal component of search. Also known as link analysis (Jones et al., 1949), frequent transitions from one region of a display to another indicate inefficient scanning with extensive search. Consider the two simple spatial distributions presented in Fig. 11. Both distributions produce the same index of spatial density and convex hull area. However, the search behaviors are dramatically different. Scanpath A has a more efficient search pattern, with a shorter scanpath length, than scanpath B.
The transition matrix is a tabular representation of the number of transitions to and from each defined area. As shown in Fig. 12, a directed scanpath from region 3 to region 5 forms a unique cell
Fig. 12. Example development of transition matrix from areas of interest on display.
pattern in the transition matrix. An unusually dense transition matrix, with most cells filled with at least one transition, indicates extensive search on a display, suggesting poor design. A sparse matrix indicates more efficient and directed search. The matrix may be characterized with a single quantitative value by dividing the number of active transition cells (i.e., those containing at least one transition) by the total number of cells. A large index value indicates a dispersed, lengthy, and wandering scanpath, whereas smaller values point to more directed and efficient search.
The de"ned AOI's may be of equal or unequal size. A content-dependent analysis would assign
each AOI to a screen window or object, with a unique AOI expressing all non-interesting areas. A content-independent analysis would simply divide the display into a grid, assigning an AOI to each grid cell. The present experiment divided the display interface into 25 regions; 24 were of equal size, whereas the 25th was the larger workspace area. In order to better capture dynamic search activity within the scanpath, intra-cell transitions need not be included (these were not shown in Fig. 12). The transition matrix density is the number of non-zero matrix cells divided by the total number of cells (25×25 cells here). The poor interface had denser (1.69%, sd = 1.01) transition
matrices than the better interface (1.37%, sd = 0.65, F = 6.91, p < 0.05), consistent with the more undirected and extensive search behavior expected due to the grouping of unrelated components in the poorer interface design.
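A minimal sketch of the transition matrix density, assuming each fixation has already been assigned an AOI index in 0..n_aois-1 and that transitions within the same AOI are ignored, as described above (the function name is illustrative):

def transition_matrix_density(aoi_sequence, n_aois=25):
    """Non-zero transition cells divided by the total number of matrix cells."""
    matrix = [[0] * n_aois for _ in range(n_aois)]
    for src, dst in zip(aoi_sequence, aoi_sequence[1:]):
        if src != dst:                       # intra-cell transitions not counted
            matrix[src][dst] += 1
    nonzero = sum(1 for row in matrix for cell in row if cell > 0)
    return nonzero / (n_aois * n_aois)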
4.5. Number of saccades
The number of saccades in a scanpath indicates the relative organization and amount of visual search on a display, with more saccades implying a greater amount of search. For applied purposes, the distance between each successive fixation can generally be defined as a saccade, with both an amplitude and a duration. That is, the number of saccades may be defined from the number of fixations minus one. A minimum amplitude of 160 pixels (5.3° visual angle) was required here to filter out small micro-saccades resulting from moving to the periphery of the prior fixation (only 80 pixels). Saccades may be quite large (e.g., 20°), so no upper bound was placed on the saccadic amplitude. The overall algorithm for counting saccades thus first tested the distance between fixation centers, incrementing the saccade count if greater than 160 pixels. Prior to acquiring the tool button target, there were 17% more saccades (averaging 2.53 saccades, sd = 1.47) produced from the poor interface than from the better interface (2.17 saccades, sd = 1.16, F = 8.74, p < 0.05).
4.6. Saccadic amplitude
A well-designed interface should provide sufficient cues to direct the user's scanning to desired targets very rapidly, with few interim fixations. This will result in an expectation of larger saccadic amplitudes. If the provided cues are not meaningful, or are misleading, the resultant saccades should be smaller, negotiating the interface until a meaningful cue appears. The average saccadic amplitude is computed from the sum of the distances between consecutive fixations, dividing this by the number of fixations minus one. Note that all saccades were used to generate this sum, with no minimum length criterion. There was no significant difference in average saccadic amplitude between the two interface designs (F = 0.22, p > 0.05). This
indicated that even with more extensive search in the poorer interface, the size of the saccades was similar between the good (303 pixels, sd = 109) and poor (299 pixels, sd = 104) interfaces. While the local search step size was the same between the two interfaces, the overall extent of search was greater in the poor design. The functional grouping layout thus aids visual search planning for proper tool selection, but does not impact individual saccadic motions.
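The two saccade measures can be sketched as follows, assuming fixations are given as (x, y) center coordinates in temporal order; the saccade count applies the 160-pixel minimum amplitude used above, while the mean amplitude uses all inter-fixation distances (names are illustrative):

from math import hypot

def saccade_amplitudes(fixation_centers):
    """Distances, in pixels, between consecutive fixation centers."""
    return [hypot(x2 - x1, y2 - y1)
            for (x1, y1), (x2, y2) in zip(fixation_centers, fixation_centers[1:])]

def number_of_saccades(fixation_centers, min_amplitude_px=160):
    """Count inter-fixation movements at least as large as the minimum amplitude."""
    return sum(1 for a in saccade_amplitudes(fixation_centers)
               if a >= min_amplitude_px)

def mean_saccadic_amplitude(fixation_centers):
    """Summed inter-fixation distances divided by (number of fixations - 1)."""
    amps = saccade_amplitudes(fixation_centers)
    return sum(amps) / len(amps) if amps else 0.0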
In both interfaces, subjects typically moved to one group and sampled a component. In the good interface, a rapid determination could be made as to whether the desired component was in the same group (common function) or not (different function). If the group functionality matched the desired component, small saccades could be made to each component within the tool grouping until the target was acquired. If the group did not match, a saccade was rapidly made to another group. The functional grouping layout, however, did not provide cues about the next group to sample, thus a small saccade was still made to an adjacent group, continuing the search. The poorer interface provided little or no information about the other components within a grouping, once a tool button was acquired. As a result, subjects again made small local saccades, exhaustively searching within the group before executing a saccade to an adjacent group.
Using these search measures, it was clear that the functional component grouping reduces the extent of required visual search by allowing one to rapidly "zoom in" on the desired component, while maintaining a relatively small amplitude for saccades.
5. Measures of processing
Visual search is conducted to obtain information from an interface, where more extensive search allows more interface objects to be processed. This does not consider the depth of required processing, however. In the present study, as the same representations were used in both interfaces, the depth of processing required to distinguish and interpret a component was not expected to differ.
5.1. Number of fixations
The number of "xations is related to the number
of components that the user is required to process,
but not the depth of required processing. When
searching for a single target, a large number of
"xations indicates the user sampled many other
objects prior to selecting the target, as if distracted
or hindered from isolating the target. The poor
interface, intentionally designed to mislead the sub-
ject, produced signi"cantly more "xations than the
ignotoerdfadceesigpnro(dFuc ed" on8.3a6v,erpa( ge 0.20.55)3.
The poor "xations
(sd"1.16) for each component search, 17% more
"xations than the good interface required (aver-
age"2.17, sd"1.47). The functionally grouped
design allowed one to `zoom ina on the correct
component more e$ciently, requiring fewer com-
ponents to be processed.
5.2. Fixation duration
Longer "xations imply the user is spending more time interpreting or relating the component representations in the interface to internalized representations. Representations that require long "xations are not as meaningful to the user as those with shorter "xation durations. Maximum and average "xation times are context-independent measures, but the duration of single "xations on targets is dependent on the interface layout. Average "xation duration was calculated by summing the number of gazepoint samples in all the "xations and dividing by the number of "xations.
The level of processing for graphic representations was expected to be the same between interfaces, as the icons were the same. Confirming this, the good interface average fixation durations (411 ms, sd = 144) and the poor interface fixation durations (391 ms, sd = 144) did not significantly differ (F = 1.92, p > 0.05).
5.3. Fixation/saccade ratio
This content-independent ratio compares the time spent processing (fixations) component representations to the time spent searching (saccades) for the components. Interfaces resulting in higher
ratios indicate that there was either more processing or less search activity than interfaces with lower ratios. Other measures can determine which of these was the case. As the fixation/saccade ratio did not significantly differ between the good (mean = 14.8, sd = 5.9) and poor (mean = 13.9, sd = 6.2) interfaces (F = 1.26, p > 0.05), if more search was required, a proportionate amount of processing was also required.
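These processing measures reduce to simple ratios once fixation and saccade durations are available; a minimal sketch, assuming durations in milliseconds have already been extracted from a scanpath (the function names are illustrative):

def mean_fixation_duration(fixation_durations_ms):
    """Average time spent per fixation, in ms."""
    return sum(fixation_durations_ms) / len(fixation_durations_ms)

def fixation_saccade_ratio(fixation_durations_ms, saccade_durations_ms):
    """Total processing time (fixations) over total search time (saccades)."""
    return sum(fixation_durations_ms) / sum(saccade_durations_ms)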
5.4. Other measures
The preceding measures describe only a portion of the potential universe of eye movement and scanpath characterization tools. Other measures may further aid interface analysis in certain circumstances. Several examples may be considered. First, a backtrack can be described as any saccadic motion that deviates more than 90° in angle from its immediately preceding saccade. These acute angles indicate rapid changes in direction, due to changes in goals and a mismatch between the user's expectation and the observed interface layout. Second, the ratio of on-target to all-target fixations can be defined by counting the number of fixations falling within a designated AOI or target, then dividing by all fixations. This is a content-dependent efficiency measure of search, with smaller ratios indicating lower efficiency. Third, the number of post-target fixations, or fixations on other areas following target capture, can indicate the target's meaningfulness to a user. High values of non-target checking, following initial target capture, indicate target representations with poor meaningfulness or visibility. Fourth, measures of scanpath regularity, considering integrated error or deviation from a regular cycle, can indicate variance in search due to a poor interface or the user's state of training. Many potential measures of scanpath complexity are possible, once cyclic scanning behavior is identified.
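Of these, the backtrack count is the most direct to automate; a minimal sketch, assuming fixation centers are given as (x, y) tuples in temporal order (the function name is illustrative):

def count_backtracks(fixation_centers):
    """Count successive saccade pairs whose directions differ by more than 90 degrees."""
    vectors = [(x2 - x1, y2 - y1)
               for (x1, y1), (x2, y2) in zip(fixation_centers, fixation_centers[1:])]
    backtracks = 0
    for (ux, uy), (vx, vy) in zip(vectors, vectors[1:]):
        # The dot product is negative exactly when the angle between the two
        # saccade direction vectors exceeds 90 degrees.
        if ux * vx + uy * vy < 0:
            backtracks += 1
    return backtracks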
6. Discussion
Successful interaction with a computer clearly requires many elements, including good visibility, meaningfulness, transparency, and the requirement
of simple motor skills. Eye movement-based evaluation of the interface, as espoused here, can only address a subset of critical interface issues that revolve around software object visibility, meaningfulness, and placement. Though not a panacea tool for design evaluation, characterization of eye movements can help by providing easily comparable quantitative metrics for objective design iteration.
One particular strength of eye movement-based evaluation is in the assessment of users' strategies at the interface. Users are usually unaware of their search processes; eye movements can provide a temporal/spatial record of search and flow while using a computer. Strategy differences are most evident during lengthy (e.g., 10–15 s) tasks, where a sufficient scanpath exists for characterization by the above methods. Tasks that are too rapid (e.g., under 1 s) do not provide sufficient data for many of the measures.
In the case of component grouping, the present study demonstrated very good validity between users' and designers' ratings of good versus poor design and many of the eye movement-based measures. The framework and measures proposed here allow improved objective estimation of users' strategies, and of the influence of interface design on those strategies. Compared with a randomly organized set of component buttons, well-organized functional grouping resulted in shorter scanpaths, covering smaller areas. The poorer interface resulted in less directed search with more (though equal in amplitude) saccades. Though similar in duration, the poor interface resulted in more fixations than the better interface. Whereas the poor interface produced less efficient search behavior, the layout of component representations did not influence their interpretability.
Ongoing investigations will further consider the sensitivity, reliability, and validity of eye movement-based measures for interface evaluation. By presenting several designed interfaces that vary more continuously in rated quality, an assessment can be made of which measures reflect these quality differences. Other factors are also being introduced, such as component meaningfulness and visibility. Ultimately, eye movements may lend some degree of diagnosticity to interface evaluations, and may possibly lead to design recommendations, similar to Tullis's (1983) methodology for text-based material.
References
Anliker, J., 1976. On line measurements, analysis and control. In: Monty, R.A., Senders, J.W. (Eds.), Eye Movements and Psychological Processes. Erlbaum Press, Hillsdale, NJ.
Belofsky, M.S., Lyon, D.R., 1988. Modeling eye movement sequences using conceptual clustering techniques. Air Force Human Resources Laboratory, Doc. AFHRL-TR-88-16, Air Force Systems, Brooks Air Force Base, TX.
Buswell, G.T., 1935. How People Look at Pictures. A Study of the Psychology of Perception in Art. The University of Chicago Press, Chicago, IL.
Dix, A., Finlay, J., Abowd, G., Beale, R., 1988. Human–Computer Interaction, 2nd ed. Prentice-Hall, London.
Goldberg, J.H., Schryver, J.C., 1993. Eye-gaze determination of user intent at the computer interface. In: Findlay, J.M., Walker, R., Kentridge, R.W. (Eds.), Eye Movement Research: Mechanisms, Processes and Applications. North-Holland Press, Amsterdam, pp. 491–502.
Goldberg, J.H., Schryver, J.C., 1995. Eye-gaze contingent control of the computer interface: Methodology and example for zoom detection. Behavior Research Methods, Instruments and Computers 27 (3), 338–350.
Jones, R.E., Milton, J.L., Fitts, P.M., 1949. Eye fixations of aircraft pilots; IV: Frequency, duration and sequence of fixations during routine instrument flight. US Air Force Technical Report 5975.
Just, M.A., Carpenter, P.A., 1976. Eye fixations and cognitive processes. Cognitive Psychology 8, 441–480.
Kolers, P.A., Duchnicky, R.L., Ferguson, D.C., 1981. Eye movement measurement of readability of CRT displays. Human Factors 23 (5), 517–527.
Krose, B.J.A., Burbeck, C.A., 1989. Spatial interactions in rapid pattern discrimination. Spatial Vision 4, 211–222.
Latimer, C.R., 1988. Eye-movement data: Cumulative fixation time and cluster analysis. Behavior Research Methods, Instruments, and Computers 20 (5), 437–470.
Loftus, G.R., Mackworth, N.H., 1978. Cognitive determinants of fixation location during picture viewing. Journal of Experimental Psychology: Human Perception and Performance 4 (4), 565–572.
Mackworth, N.H., 1976. Stimulus density limits the useful field of view. In: Monty, R.A., Senders, J.W. (Eds.), Eye Movements and Psychological Processes. Erlbaum, Hillsdale, NJ.
Ponsoda, V., Scott, D., Findlay, J.M., 1995. A probability vector and transition matrix analysis of eye movements during visual search. Acta Psychologica 88, 167–185.
Ramakrishna, S., Pillalamarri, B., Barnette, D., Birkmire, D., Karsh, R., 1993. Cluster: A program for the identification of eye-fixation-cluster characteristics. Behavior Research Methods, Instruments, and Computers 25 (1), 9–15.
Robinson, G.H., 1979. Dynamics of the eye and head during movement between displays: A qualitative and quantitative guide for designers. Human Factors 21 (3), 343–352.
Scinto, L., Barnette, B.D., 1986. An algorithm for determining clusters, pairs and singletons in eye-movement scan-path records. Behavior Research Methods, Instruments, and Computers 18 (1), 41–44.
Sedgewick, R., 1990. Algorithms in C. Addison-Wesley, Reading, MA.
Tullis, T.S., 1983. The formatting of alphanumeric displays: Review and analysis. Human Factors 25 (6), 657–682.
Viviani, P., 1990. In: Kowler, E. (Ed.), Eye Movements and Their Role in Visual and Cognitive Processes, Ch. 8. Elsevier Science, Amsterdam.
Wickens, C.D., Carswell, C.M., 1995. The proximity compatibility principle: Its psychological foundation and relevance to display design. Human Factors 37 (3), 473–494.
Wickens, C.D., Gordon, S.E., Liu, Y., 1998. An Introduction to Human Factors Engineering. Addison-Wesley and Longman, New York.
Yamamoto, S., Kuto, Y., 1992. A method of evaluating VDT screen layout by eye movement analysis. Ergonomics 35 (5/6), 591–606.

View File

@@ -0,0 +1,12 @@
Title: PII: S0169-8141(98)00068-7
Producer: Acrobat Distiller 3.02 for Power Macintosh
CreationDate: 09/09/99 09:21:30
ModDate: 09/09/99 11:09:22
Tagged: no
Form: none
Pages: 16
Encrypted: no
Page size: 504 x 671.04 pts (rotated 0 degrees)
File size: 340665 bytes
Optimized: no
PDF version: 1.2

View File

@@ -0,0 +1,268 @@
Contributions to the Physiology of Vision.--Part the First. On Some Remarkable, and Hitherto Unobserved, Phenomena of Binocular Vision
Author(s): Charles Wheatstone
Source: Philosophical Transactions of the Royal Society of London, 1838, Vol. 128 (1838), pp. 371-394
Published by: Royal Society
Stable URL: https://www.jstor.org/stable/108203
[Plate X: engraved figures accompanying the paper; not legible in this scan.]
XVIII. Contributions to the Physiology of Vision.-Part the First. On some remarkable, and hitherto unobserved, Phenomena of Binocular Vision. By CHARLES WHEATSTONE, F.R.S., Professor of Experimental Philosophy in King's College,
London.
Received and Read June 21, 1838.
When an object is viewed at so great a distance that the optic axes of both eyes are sensibly parallel when directed towards it, the perspective projections of it, seen by each eye separately, are similar, and the appearance to the two eyes is precisely the same as when the object is seen by one eye only. There is, in such case, no difference between the visual appearance of an object in relief and its perspective projection on a plane surface; and hence pictorial representations of distant objects, when those circumstances which would prevent or disturb the illusion are carefully excluded, may be rendered such perfect resemblances of the objects they are intended to represent as to be mistaken for them; the Diorama is an instance of this. But this similarity no longer exists when the object is placed so near the eyes that to view it the optic axes must converge; under these conditions a different perspective projection of it is seen by each eye, and these perspectives are more dissimilar as the convergence of the optic axes becomes greater. This fact may be easily verified by placing any figure of three dimensions, an outline cube for instance, at a moderate distance before the eyes, and while the head is kept perfectly steady, viewing it with each eye successively while the other is closed. Plate XI. fig. 13. represents the two perspective projections of a cube; b is that seen by the right eye, and a that presented to the left eye; the figure being supposed to be placed about seven inches immediately before the spectator.
The appearances, which are by this simple experiment rendered so obvious, may be easily inferred from the established laws of perspective; for the same object in relief is, when viewed by a different eye, seen from two points of sight at a distance from each other equal to the line joining the two eyes. Yet they seem to have escaped the attention of every philosopher and artist who has treated of the subjects of vision and perspective. I can ascribe this inattention to a phenomenon leading to the important and curious consequences, which will form the subject of the present communication, only to this circumstance; that the results being contrary to a principle which was very generally maintained by optical writers, viz. that objects can
be seen single only when their images fall on corresponding points of the two retinae, an hypothesis which will be hereafter discussed, if the consideration ever arose in their minds, it was hastily discarded under the conviction, that if the pictures presented to the two eyes are under certain circumstances dissimilar, their differences must be so small that they need not be taken into account.
It will now be obvious why it is impossible for the artist to give a faithful representation of any near solid object, that is, to produce a painting which shall not be distinguished in the mind from the object itself. When the painting and the object are seen with both eyes, in the case of the painting two similar pictures are projected on the retinae, in the case of the solid object the pictures are dissimilar; there is therefore an essential difference between the impressions on the organs of sensation in the two cases, and consequently between the perceptions formed in the mind; the painting therefore cannot be confounded with the solid object.
After looking over the works of many authors who might be expected to have made some remarks relating to this subject, I have been able to find but one, which is in the Trattato della Pittura of LEONARDO DA VINCI *. This great artist and ingenious philosopher observes, "that a painting, though conducted with the greatest art and finished to the last perfection, both with regard to its contours, its lights, its shadows and its colours, can never show a relievo equal to that of the natural objects, unless these be viewed at a distance and with a single eye. For," says he, "if an object C (Plate X. fig. 1.) be viewed by a single eye at A, all objects in the space behind it, included as it were in a shadow E C F cast by a candle at A, are invisible to the eye at A; but when the other eye at B is opened, part of these objects become visible to it; those only being hid from both eyes that are included, as it were, in the double shadow C D, cast by two lights at A and B, and terminated in D, the angular space E D G beyond D being always visible to both eyes. And the hidden space C D is so much the shorter, as the object C is smaller and nearer to the eyes. Thus the object C seen with both eyes becomes, as it were, transparent, according to the usual definition of a transparent thing; namely, that which hides nothing beyond it. But this cannot happen when an object, whose breadth is bigger than that of the pupil, is viewed by a single eye. The truth of this observation is therefore evident, because a painted figure intercepts all the space behind its apparent place, so as to preclude the eyes from the sight of every part of the imaginary ground behind it."
Had LEONARDO DA VINCI taken, instead of a sphere, a less simple figure for the purpose of his illustration, a cube for instance, he would not only have observed that the object obscured from each eye a different part of the more distant field of view, but the fact would also perhaps have forced itself upon his attention, that the object itself presented a different appearance to each eye. He failed to do this, and no subsequent writer within my knowledge has supplied the omission; the projection of two
* See also a Treatise of Painting, p. 178. London, 1721; and Dr. SMITH'S Complete System of Optics, Vol. ii. p. 244, where the passage is quoted.
[Plate XI: engraved stereoscopic figures; not legible in this scan.]
obviously dissimilar pictures on the two retinae when a single object is viewed, while the optic axes converge, must therefore be regarded as a new fact in the theory of
vision.
§ 2.
It being thus established that the mind perceives an object of three dimensions by means of the two dissimilar pictures projected by it on the two retinae, the following question occurs: What would be the visual effect of simultaneously presenting to each eye, instead of the object itself, its projection on a plane surface as it appears to that eye? To pursue this inquiry it is necessary that means should be contrived to make the two pictures, which must necessarily occupy different places, fall on similar parts of both retinae. Under the ordinary circumstances of vision the object is seen at the concourse of the optic axes, and its images consequently are projected on similar parts of the two retinae; but it is also evident that two exactly similar objects may be made to fall on similar parts of the two retinae, if they are placed one in the direction of each optic axis, at equal distances before or beyond their intersection.
Fig. 2. represents the usual situation of an object at the intersection of the optic axes. In fig. 3. the similar objects are placed in the direction of the optic axes before their intersection, and in fig. 4. beyond it. In all these three cases the mind perceives but a single object, and refers it to the place where the optic axes meet. It will be observed that when the eyes converge beyond the objects, as in fig. 3., the right hand object is seen by the right eye, and the left hand object by the left eye; while when the axes converge nearer than the objects, the right hand object is seen by the left eye, and conversely. As both of these modes of vision are forced and unnatural, eyes unaccustomed to such experiments require some artificial assistance. If the eyes are to converge beyond the objects, this may be afforded by a pair of tubes (fig. 5.) capable of being inclined towards each other at various angles, so as to correspond with the different convergences of the optic axes. If the eyes are to converge at a nearer distance than that at which the objects are placed, a box (fig. 6.) may be conveniently employed; the objects a a' are placed distant from each other, on a stand capable of being moved nearer the eyes if required, and the optic axes being directed towards them will cross at c, the aperture b b' allowing the visual rays from the right hand object to reach the left eye, and those from the left hand object to fall on the right eye; the coincidence of the images may be facilitated by placing the point of a needle at the point of intersection of the optic axes c, and fixing the eyes upon it. In both these instruments (figs. 5. and 6.) the lateral images are hidden from view, and much less difficulty occurs in making the images unite than when the naked eyes are employed.
Now if, instead of placing two exactly similar objects to be viewed by the eyes in either of the modes above described, the two perspective projections of the same solid object be so disposed, the mind will still perceive the object to be single, but instead of a representation on a plane surface, as each drawing appears to be when separately
viewed by that eye which is directed towards it, the observer will perceive a figure of three dimensions, the exact counterpart of the object from which the drawings were made. To make this matter clear I will mention one or two of the most simple cases.
If two vertical lines near each other, but at different distances from the spectator, be regarded first with one eye and then with the other, the distance between them when referred to the same plane will appear different; if the left hand line be nearer to the eyes, the distance as seen by the left eye will be less than the distance as seen by the right eye; fig. 7. will render this evident; a a' are vertical sections of the two original lines, and b b the plane to which their projections are referred. Now if the two lines be drawn on two pieces of card, at the respective distances at which they appear to each eye, and these cards be afterwards viewed by either of the means above directed, the observer will no longer see two lines on a plane surface, as each card separately shows; but two lines will appear, one nearer to him than the other, precisely as the original vertical lines themselves. Again, if a straight wire be held before the eyes in such a position that one of its ends shall be nearer to the observer than the other is, each eye separately referring it to a plane perpendicular to the common axis, will see a line differently inclined; and then if lines having the same apparent inclinations be drawn on two pieces of card, and be presented to the eyes as before directed, the real position of the original line will be correctly perceived
the mind. In the same manner the most complex figures of three dimensions may be accu-
rately represented to the mind, by presenting their two perspective projections to the two retinae. But I shall defer these more perfect experiments until I describe an instrument which will enable any person to observe all the phenomena in question with the greatest ease and certainty.
In the instruments above described the optic axes converge to some point in a plane before or beyond that in which the objects to be seen are situated. The adaptation of the eye, which enables us to see distinctly at different distances, and which habitually accompanies every different degree of convergence of the optic axes, does not immediately adjust itself to the new and unusual condition; and to persons not accustomed to experiments of this kind, the pictures will either not readily unite, or will appear dim and confused. Besides this, no object can be viewed according to either mode when the drawings exceed in breadth the distance of the two points of the optic axes in which their centres are placed.
These inconveniences are removed by the instrument I am about to describe; the two pictures (or rather their reflected images) are placed in it at the true concourse of the optic axes, the focal adaptation of the eye preserves its usual adjustment, the appearance of lateral images is entirely avoided, and a large field of view for each eye is obtained. The frequent reference I shall have occasion to make to this instrument, will render it convenient to give it a specific name, I therefore propose that it be called a Stereoscope, to indicate its property of representing solid figures.
§ 3.
The stereoscope is represented by figs. 8. and 9; the former being a front view, and the latter a plan of the instrument. A A' are two plane mirrors, about four inches square, inserted in frames, and so adjusted that their backs form an angle of 90° with each other; these mirrors are fixed by their common edge against an upright B, or, which was less easy to represent in the drawing, against the middle line of a vertical board, cut away in such manner as to allow the eyes to be placed before the two mirrors. C C' are two sliding boards, to which are attached the upright boards D D', which may thus be removed to different distances from the mirrors. In most of the experiments hereafter to be detailed, it is necessary that each upright board shall be at the same distance from the mirror which is opposite to it. To facilitate this double adjustment, I employ a right and a left-handed wooden screw, r l; the two ends of this compound screw pass through the nuts e e', which are fixed to the lower parts of the upright boards D D', so that by turning the screw pin p one way the two boards will approach, and by turning it the other they will recede from each other, one always preserving the same distance as the other from the middle line f. E E' are pannels, to which the pictures are fixed in such manner that their corresponding horizontal lines shall be on the same level: these pannels are capable of sliding backwards and forwards in grooves on the upright boards D D'. The apparatus having been described, it now remains to explain the manner of using it. The observer must place his eyes as near as possible to the mirrors, the right eye before the right hand mirror, and the left eye before the left hand mirror, and he must move the sliding pannels E E' to or from him until the two reflected images coincide at the intersection of the optic axes, and form an image of the same apparent magnitude as each of the component pictures. The pictures will indeed coincide when the sliding pannels are in a variety of different positions, and consequently when viewed under different inclinations of the optic axes; but there is only one position in which the binocular image will be immediately seen single, of its proper magnitude, and without fatigue to the eyes, because in this position only the ordinary relations between the magnitude of the pictures on the retina, the inclination of the optic axes, and the adaptation of the eye to distinct vision at different distances are preserved. The alteration in the apparent magnitude of the binocular images, when these usual relations are disturbed, will be discussed in another paper of this series, with a variety of remarkable phenomena depending thereon. In all the experiments detailed in the present memoir I shall suppose these relations to remain undisturbed, and the optic axes to converge about six or eight inches before the eyes.
If the pictures are all drawn to be seen with the same inclination of the optic axes, the apparatus may be simplified by omitting the screw r l and fixing the upright boards D D' at the proper distances. The sliding pannels may also be dispensed with, and the drawings themselves be made to slide in the grooves.
§ 4.
A few pairs of outline figures, calculated to give rise to the perception of objects of three dimensions when placed in the stereoscope in the manner described, are represented from figs. 10. to 20. They are one half the linear size of the figures actually employed. As the drawings are reversed by reflection in the mirrors, I will suppose these figures to be the reflected images to which the eyes are directed in the apparatus; those marked b being seen by the right eye, and those marked a by the left eye. The drawings, it has been already explained, are two different projections of the same object seen from two points of sight, the distance between which is equal to the interval between the eyes of the observer; this interval is generally about 2½ inches.
a and b, fig. 10. will, when viewed in the stereoscope, present to the mind a line in the vertical plane, with its lower end inclined towards the observer. If the two component lines be caused to turn round their centres equally in opposite directions, the resultant line will, while it appears to assume every degree of inclination to the referent plane, still seem to remain in the same vertical plane.
Fig. 11. A series of points all in the same horizontal plane, but each towards the right hand successively nearer the observer.
Fig. 12. A curved line intersecting the referent plane, and having its convexity towards the observer.
Fig. 13. A cube. Fig. 14. A cone, having its axis perpendicular to the referent plane, and its vertex towards the observer. Fig. 15. The frustum of a square pyramid; its axis perpendicular to the referent plane, and its base furthest from the eye. Fig. 16. Two circles at different distances from the eyes, their centres in the same perpendicular, forming the outline of the frustum of a cone. The other figures require no observation.
For the purposes of illustration I have employed only outline figures, for had either shading or colouring been introduced it might be supposed that the effect was wholly or in part due to these circumstances, whereas by leaving them out of consideration no room is left to doubt that the entire effect of relief is owing to the simultaneous perception of the two monocular projections, one on each retina. But if it be required to obtain the most faithful resemblances of real objects, shadowing and colouring may properly be employed to heighten the effects. Careful attention would enable an artist to draw and paint the two component pictures, so as to present to the mind of the observer, in the resultant perception, perfect identity with the object represented. Flowers, crystals, busts, vases, instruments of various kinds, &c., might thus be represented so as not to be distinguished by sight from the real objects them-
selves.
It is worthy of remark, that the process by which we thus become acquainted with the real forms of solid objects, is precisely that which is employed in descriptive geometry, an important science we owe to the genius of MONGE, but which is little studied or known in this country. In this science, the position of a point, a right line or a curve, and consequently of any figure whatever, is completely determined by assigning its projections on two fixed planes, the situations of which are known, and which are not parallel to each other. In the problems of descriptive geometry the two referent planes are generally assumed to be at right angles to each other, but in binocular vision the inclination of these planes is less according as the angle made at the concourse of the optic axes is less; thus the same solid object is represented to the mind by different pairs of monocular pictures, according as they are placed at a different distance before the eyes, and the perception of these differences (though we seem to be unconscious of them) may assist in suggesting to the mind the distance of the object. The more inclined to each other the referent planes are, with the greater accuracy are the various points of the projections referred to their proper places; and it appears to be a useful provision that the real forms of those objects which are nearest to us are thus more determinately apprehended than those which are more distant.
§ 5.
A very singular effect is produced when the drawing originally intended to be seen by the right eye is placed at the left hand side of the stereoscope, and that designed to be seen by the left eye is placed on its right hand side. A figure of three dimensions, as bold in relief as before, is perceived, but it has a different form from that which is seen when the drawings are in their proper places. There is a certain relation between the proper figure and this, which I shall call its converse figure. Those points which are nearest the observer in the proper figure are the most remote from him in the converse figure, and vice versâ, so that the figure is, as it were, inverted; but it is not an exact inversion, for the near parts of the converse figure appear smaller, and the remote parts larger than the same parts before the inversion. Hence the drawings which, properly placed, occasion a cube to be perceived, when changed in the manner described, represent the frustum of a square pyramid with its base remote from the eye: the cause of this is easy to understand.
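The nature of this converse figure can be checked numerically. The following small sketch is my own illustration, not Wheatstone's construction; the interocular distance, fixation distance and pinhole model of the eye are assumed purely for the example. Each eye records a point as the angle between its line of sight to that point and its line of sight to the fixation point; swapping the two monocular records exchanges these angles between the eyes, and intersecting the resulting visual lines locates the converse point.

import numpy as np

# A minimal sketch (not from the original paper): two pinhole "eyes" fixate a
# point F on the midline; each eye encodes a target point P by the angle between
# its line of sight to P and its line of sight to F.  Swapping the two monocular
# pictures exchanges these angles between the eyes.  All numbers are illustrative.
b = 0.065          # assumed interocular distance in metres
D = 0.40           # assumed fixation distance in metres
eyes = {"L": np.array([-b / 2, 0.0]), "R": np.array([+b / 2, 0.0])}
F = np.array([0.0, D])

def angle_to(eye, P):
    """Signed angle of P relative to the eye's fixation direction."""
    to_F, to_P = F - eye, P - eye
    return np.arctan2(to_P[0], to_P[1]) - np.arctan2(to_F[0], to_F[1])

def intersect(eye_angles):
    """Intersect the two visual rays defined by per-eye angles (relative to fixation)."""
    origins, dirs = [], []
    for name, a in eye_angles.items():
        eye = eyes[name]
        base = np.arctan2((F - eye)[0], (F - eye)[1])
        dirs.append(np.array([np.sin(base + a), np.cos(base + a)]))
        origins.append(eye)
    (oL, oR), (dL, dR) = origins, dirs
    A = np.column_stack([dL, -dR])          # solve oL + t*dL = oR + s*dR
    t, _ = np.linalg.solve(A, oR - oL)
    return oL + t * dL

P_near = np.array([0.0, D - 0.05])                           # 5 cm nearer than fixation
angles = {n: angle_to(eyes[n], P_near) for n in eyes}
proper = intersect(angles)                                   # ~ (0, 0.35): in front of fixation
converse = intersect({"L": angles["R"], "R": angles["L"]})   # pictures swapped: beyond fixation
print("proper  :", proper)
print("converse:", converse)

With these assumed numbers a point 5 cm nearer than the fixation point converts into one roughly 6 to 7 cm beyond it, so the inversion is real but not exact, in keeping with the remark that the near and remote parts of the converse figure are not simply interchanged.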
This conversion of relief may be shown by all the pairs of drawings from fig. 10. to 19. In the case of simple figures like these the converse figure is as readily apprehended as the original one, because it is generally a figure of as frequent occurrence; but in the case of a more complicated figure, an architectural design, for instance, the mind, unaccustomed to perceive its converse, because it never occurs in nature, can find no meaning in it.
§ 6.
The same image is depicted on the retina by an object of three dimensions as by its projection on a plane surface, provided the point of sight remain in both cases the same. There should be, therefore, no difference in the binocular appearance of two drawings, one presented to each eye, and of two real objects so presented to the two eyes that their projections on the retina shall be the same as those arising from the drawings. The following experiments will prove the justness of this inference.
I procured several pairs of skeleton figures, i. e. outline figures of three dimensions, formed either of iron wire or of ebony beading about one tenth of an inch in thickness. The pair I most frequently employed consisted of two cubes, whose sides were three inches in length. When I placed these skeleton figures on stands before the two mirrors of the stereoscope, the following effects were produced, according as their relative positions were changed. 1st. When they were so placed that the pictures which their reflected images projected on the two retinae were precisely the same as those which would have been projected by a cube placed at the concourse of the optic axes, a cube in relief appeared before the eyes. 2ndly. When they were so placed that their reflected images projected exactly similar pictures on the two retinae, all effect of relief was destroyed, and the compound appearance was that of an outline representation on a plane surface. 3rdly. When the cubes were so placed that the reflected image of one projected on the left retina the same picture as in the first case was projected on the right retina, and conversely, the converse figure in relief appeared.
§ 7. If a symmetrical object, that is one whose right and left sides are exactly similar to each other but inverted, be placed so that any point in the plane which divides it into these two halves is equally distant from the two eyes, its two monocular projections are, it is easy to see, inverted fac-similes of each other. Thus fig. 15, a and b are symmetrical monocular projections of the frustum of a four-sided pyramid, and figs. 13. 14. 16. are corresponding projections of other symmetrical objects. This being kept in view, I will describe an experiment which, had it been casually observed previous to the knowledge of the principles developed in this paper, would have appeared an inexplicable optical illusion. M and M' (fig. 21.) are two mirrors, inclined so that their faces form an angle of 90° with each other. Between them in the bisecting plane is placed a plane outline figure, such as fig. 15 a, made of card, all parts but the lines being cut away, or of wire. A reflected image of this outline, placed at A, will appear behind each mirror at B and B', and one of these images will be the inversion of the other. If the eyes be made to converge at C, it is obvious that these two reflected images will fall on corresponding parts of the two retinae, and a figure of three dimensions will be perceived; if the outline placed in the bisecting plane be reversed, the converse skeleton
form will appear; in both these experiments we have the singular phenomenon of the conversion of a single plane outline into a figure of three dimensions. To render the binocular object more distinct, concave lenses may be applied to the eyes; and to prevent the two lateral images from being seen, screens may be placed at D and D'.
§ 8. An effect of binocular perspective may be remarked in a plate of metal, the surface of which has been made smooth by turning it in a lathe. When a single candle is brought near such a plate, a line of light appears standing out from it, one half being above, and the other half below the surface; the position and inclination of this line changes with the situation of the light and of the observer, but it always passes through the centre of the plate. On closing the left eye the relief disappears, and the luminous line coincides with one of the diameters of the plate; on closing the right eye the line appears equally in the plane of the surface, but coincides with another diameter; on opening both eyes it instantly starts into relief*. The case here is exactly analogous to the vision of two inclined lines (fig. 10.) when each is presented to a different eye in the stereoscope. It is curious, that an effect like this, which must have been seen thousands of times, should never have attracted sufficient attention to have been made the subject of philosophic observation. It was one of the earliest facts which drew my attention to the subject I am now treating.
Dr. SMITH† was very much puzzled by an effect of binocular perspective which he observed, but was unable to explain. He opened a pair of compasses, and while he held the joint in his hand, and the points outwards and equidistant from his eyes, and somewhat higher than the joint, he looked at a more distant point; the compasses appeared double. He then compressed the legs until the two inner points coincided; having done this the two inner legs also entirely coincided, and bisected the angle formed by the outward ones, appearing longer and thicker than they did, and reaching from the hand to the remotest object in view. The explanation offered by Dr. SMITH accounts only for the coincidence of the points of the compasses, not for that of the entire leg. The effect in question is best seen by employing a pair of straight wires, about a foot in length. A similar observation, made with two flat rulers, and afterwards with silk threads, induced Dr. WELLS to propose a new theory of visible direction in order to explain it, so inexplicable did it seem to him by any of the received theories.
* The luminous line seen by a single eye arises from the reflection of the light from each of the concentric circles produced in the operation of turning; when the plate is not large the arrangement of these successive reflections does not differ from a straight line.
† System of Optics, vol. ii. p. 388. and p. 526.
§ 9.
The preceding experiments render it evident that there is an essential difference in the appearance of objects when seen with two eyes, and when only one eye is employed, and that the most vivid belief of the solidity of an object of three dimensions arises from two different perspective projections of it being simultaneously presented to the mind. How happens it then, it may be asked, that persons who see with only one eye form correct notions of solid objects, and never mistake them for pictures? and how happens it also, that a person having the perfect use of both eyes, perceives no difference in objects around him when he shuts one of them? To explain these apparent difficulties, it must be kept in mind, that although the simultaneous vision of two dissimilar pictures suggests the relief of objects in the most vivid manner, yet there are other signs which suggest the same ideas to the mind, which, though more ambiguous than the former, become less liable to lead the judgment astray in proportion to the extent of our previous experience. The vividness of relief arising from the projection of two dissimilar pictures, one on each retina, becomes less and less as the object is seen at a greater distance before the eyes, and entirely ceases when it is so distant that the optic axes are parallel while regarding it. We see with both eyes all objects beyond this distance precisely as we see near objects with a single eye; for the pictures on the two retinae are then exactly similar, and the mind appreciates no difference whether two identical pictures fall on corresponding parts of the two retinae, or whether one eye is impressed with only one of these pictures. A person deprived of the sight of one eye sees therefore all external objects, near and remote, as a person with both eyes sees remote objects only, but that vivid effect arising from the binocular vision of near objects is not perceived by the former; to supply this deficiency he has recourse unconsciously to other means of acquiring more accurate information. The motion of the head is the principal means he employs. That the required knowledge may be thus obtained will be evident from the following considerations. The mind associates with the idea of a solid object every different projection of it which experience has hitherto afforded; a single projection may be ambiguous, from its being also one of the projections of a picture, or of a different solid object; but when different projections of the same object are successively presented, they cannot all belong to another object, and the form to which they belong is completely characterized. While the object remains fixed, at every movement of the head it is viewed from a different point of sight, and the picture on the retina consequently continually changes.
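A rough quantitative gloss (added here; the symbols are mine and not from the memoir) makes the dependence of this vividness on distance explicit. For eyes separated by b fixating a point at distance D, the optic axes converge through an angle of about

\[
\gamma \approx \frac{b}{D},
\]

and the difference between the two monocular projections produced by a small depth step \delta near the fixation point is of the order of

\[
\Delta \approx \frac{b\,\delta}{D^{2}} .
\]

Since \Delta falls off as the square of the distance, the dissimilarity of the two pictures, and with it the vivid binocular relief, becomes insensible for remote objects, for which the optic axes are sensibly parallel.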
Every one must be aware how greatly the perspective effect of a picture is enhanced by looking at it with only one eye, especially when a tube is employed to exclude the vision of adjacent objects, whose presence might disturb the illusion. Seen under such circumstances from the proper point of sight, the picture projects the same lines, shades and colours on the retina, as the more distant scene which it represents
would do were it substituted for it. The appearance which would make us certain that it is a picture is excluded from the sight, and the imagination has room to be active. Several of the older writers erroneously attributed this apparent superiority of monocular vision to the concentration of the visual power in a single eye*.
There is a well-known and very striking illusion of perspective which deserves a passing remark, because the reason of the effect does not appear to be generally understood. When a perspective of a building is projected on a horizontal plane, so that the point of sight is in a line greatly inclined towards the plane, the building appears to a single eye placed at the point of sight to be in bold relief, and the illusion is almost as perfect as in the binocular experiments described in §§ 2, 3, 4. The effect wholly arises from the unusual projection, which suggests to the mind more readily the object itself than the drawing of it; for we are accustomed to see real objects in almost every point of view, but perspective representations being generally made in a vertical plane with the point of sight in a line perpendicular to the plane of projection, we are less familiar with the appearance of other projections. Any other unusual projection will produce the same effect.
§ 10.
If we look with a single eye at the drawing of a solid geometrical figure, it may be imagined to be the representation of either of two dissimilar solid figures, the figure intended to be represented, or its converse figure (§ 5.). If the former is a very usual and the latter a very unusual figure, the imagination will fix itself on the original without wandering to the converse figure; but if both are of ordinary occurrence, which is generally the case with regard to simple forms, a singular phenomenon takes place; it is perceived at one time distinctly as one of these figures, at another time as the other, and while one figure continues it is not in the power of the will to change it immediately.
The same phenomenon takes place, though less decidedly, when the drawing is seen with both eyes. Many of my readers will call to mind the puzzling effect of some of the diagrams annexed to the problems of the eleventh book of Euclid; which, when they were attentively looked at, changed in an arbitrary manner from one solid figure to another, and would obstinately continue to present the converse figures when the real figures alone were wanted. This perplexing illusion must be of common occurrence, but I have only found one recorded observation relating to the subject. It is by Professor NECKER of Geneva, and I shall quote it in his own words from the Philosophical Magazine, Third Series, vol. i. p. 337.
"The object I have now to call your attention to is an observation which has often
* "C We see more exquisitely with one eye shut than with both, because the vital spirits thus unite themselves the more, and become the stronger: for we may find by looking in a Class whilst we shut one eye} that the pupil of the other dilates."-Lord BACON'S Works, Sylva Sylvarumn, art. Vision.
occurred to me while examining figures and engraved plates of crystalline forms; I mean a sudden and involuntary change in the apparent position of a crystal or solid represented in an engraved figure. What I mean will be more easily understood from the figure annexed (fig. 22.). The rhomboid A X is drawn so that the solid angle A should be seen the nearest to the spectator, and the solid angle X the farthest from him, and that the face A C D B should be the foremost, while the face X D C is behind. But in looking repeatedly at the same figure, you will perceive that at times the apparent position of the rhomboid is so changed that the solid angle X will appear the nearest, and the solid angle A the farthest; and that the face A C D B will recede behind the face X D C, which will come forward, which effect gives to the whole solid a quite contrary apparent inclination."
Professor NECKER attributes this alteration of appearance, not to a mental operation, but to an involuntary change in the adjustment of the eye for obtaining distinct vision. He supposed that whenever the point of distinct vision on the retina is directed on the angle A, for instance, this angle seen more distinctly than the others is naturally supposed to be nearer and foremost, while the other angles seen indistinctly are supposed to be farther and behind, and that the reverse takes place when the point of distinct vision is brought to bear on the angle X.
That this is not the true explanation, is evident from three circumstances: in the first place, the two points A and X being both at the same distance from the eyes, the same alteration of adjustment which would make one of them indistinct would make the other so; secondly, the figure will undergo the same changes whether the focal distance of the eye be adjusted to a point before or beyond the plane in which the figure is drawn; and thirdly, the change of figure frequently occurs while the eye continues to look at the same angle. The effect seems entirely to depend on our mental contemplation of the figure intended to be represented, or of its converse. By following the lines with the eye with a clear idea of the solid figure we are describing, it may be fixed for any length of time; but it requires practice to do this or to change the figure at will. As I have before observed, these effects are far more obvious when the figures are regarded with one eye only.
No illusion of this kind can take place when an object of three dimensions is seen with both eyes while the optic axes make a sensible angle with each other, because the appearance of the two dissimilar images, one to each eye, prevents the possibility of mistake. But if we regard an object at such a distance that its two projections are sensibly identical, and if this projection be capable of a double interpretation, the illusion may occur. Thus a placard on a pole carried in the streets, with one of its sides inclined towards the observer, will, when he is distant from it, frequently appear inclined in a contrary direction. Many analogous instances might be adduced, but this will suffice to call others to mind; it must however be observed, that when shadows, or other means capable of determining the judgement are present, these fallacies do not arise.
§ 11.
The same indetermination of judgement which causes a drawing to be perceived by the mind at different times as two different figures, frequently gives rise to a false perception when objects in relief are regarded with a single eye. The apparent conversion of a cameo into an intaglio, and of an intaglio into a cameo, is a well-known instance of this fallacy in vision; but the fact does not appear to me to have been correctly explained, nor the conditions under which it occurs to have been properly stated.
This curious illusion, which has been the subject of much attention, was first observed at one of the early meetings of the Royal Society*. Several of the members looking through a compound microscope of a new construction at a guinea, some of them imagined the image to be depressed, while others thought it to be embossed, as it really was. Professor GMELIN, of Wurtemburg, published a paper on the same subject in the Philosophical Transactions for 1745; his experiments were made with telescopes and compound microscopes which inverted the images; and he observed that the conversion of relief appeared in some cases and not in others, at some times and not at others, and to some eyes also and not to others. He endeavoured to ascertain some of the conditions of the two appearances; "but why these things should so happen," says he, "I do not pretend to determine."
Sir DAVID BREWSTER accounts for the fallacy in the following manner†:-"A hollow seal being illuminated by a window or a candle, its shaded side is of course on the same side with the light. If we now invert the seal with one or more lenses, so that it may look in the opposite direction, it will appear to the eye with the shaded side furthest from the window. But as we know that the window is still on our left hand, and as every body with its shaded side furthest from the light must necessarily be convex or protuberant, we immediately believe that the hollow seal is now a cameo or bas-relief. The proof which the eye thus receives of the seal being raised, overcomes the evidence of its being hollow, derived from our actual knowledge and from the sense of touch. In this experiment the deception takes place from our knowing the real direction of the light which falls on the seal; for if the place of the window, with respect to the seal, had been inverted as well as the seal itself, the illusion could not have taken place. The illusion, therefore, under our consideration is the result of an operation of our own minds, whereby we judge of the forms of bodies by the knowledge we have acquired of light and shadow. Hence the illusion depends on the accuracy and extent of our knowledge on this subject; and while some persons are under its influence, others are entirely insensible to it."
These considerations do not fully explain the phenomenon, for they suppose that the image must be inverted, and that the light must fall in a particular direction; but the conversion of relief will still take place when the object is viewed through an
* BIRCH's History, vol. ii. p. 348. † Natural Magic, p. 100.
open tube without any lenses to invert it, and also when it is equally illuminated in all parts. The true explanation I believe to be the following. If we suppose a cameo and an intaglio of the same object, the elevations of the one corresponding exactly to the depressions of the other, it is easy to show that the projection of either on the retina is sensibly the same. When the cameo or the intaglio is seen with both eyes, it is impossible to mistake an elevation for a depression, for reasons which have been already amply explained; but when either is seen with one eye only, the most certain guide of our judgement, viz. the presentation of a different picture to each eye, is wanting; the imagination therefore supplies the deficiency, and we conceive the object to be raised or depressed according to the dictates of this faculty. No doubt in such cases our judgement is in a great degree influenced by accessory circumstances, and the intaglio or the relief may sometimes present itself according to our previous knowledge of the direction in which the shadows ought to appear; but the real cause of the phenomenon is to be found in the indetermination of the judgement arising from our more perfect means of judging being absent.
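That the cameo and the intaglio cast sensibly the same monocular picture when the relief is shallow may be seen from a small estimate (added here for illustration; h and D are my notation for the depth of relief and the viewing distance). A feature at lateral offset x, raised or sunk by h from a surface at distance D, projects to a position proportional to

\[
\frac{x}{D \mp h} \approx \frac{x}{D}\left(1 \pm \frac{h}{D}\right),
\]

so interchanging elevation and depression alters the image only by a quantity of order 2xh/D^{2}, which is imperceptible when h is small compared with D; the single picture therefore leaves the sign of the relief undetermined, and the imagination must decide it.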
Observers with the microscope must be particularly on their guard against illusions of this kind. RASPAIL observes* that the hollow pyramidal arrangement of the crystals of muriate of soda appears, when seen through a microscope, like a striated pyramid in relief. He recommends two modes of correcting the illusion. The first is to bring successively to the focus of the instrument the different parts of the crystal; if the pyramid be in relief, the point will arrive at the focus sooner than the base will; if the pyramid be hollow, the contrary will take place. The second mode is to project a strong light on the pyramid in the field of view of the microscope, and to observe which sides of the crystal are illuminated, taking however the inversion of the image into consideration if a compound microscope be employed.
The inversion of relief is very striking when a skeleton cube is looked at with one eye, and the following singular results may in this case be observed. So long as the mind perceives the cube, however the figure be turned about, its various appearances will be but different representations of the same object, and the same primitive form will be suggested to the mind by all of them: but it is not so if the converse figure fixes the attention; the series of successive projections cannot then be referred to any figure to which they are all common, and the skeleton figure will appear to be continually undergoing a change of shape.
§ 12.
I have given ample proof that objects whose pictures do not fall on corresponding points of the two retinae may still appear single. I will now adduce an experiment which proves that similar pictures falling on corresponding points of the two retinae may appear double and in different places.
Present, in the stereoscope, to the right eye a vertical line, and to the left eye a
* Nouveau Système de Chimie Organique, 2me edit. t. 1. p. 333.
line inclined some degrees from the perpendicular (fig. 23.); the observer will then perceive, as formerly explained, a line, the extremities of which appear at different distances before the eyes. Draw on the left hand figure a faint vertical line exactly corresponding in position and length to that presented to the right eye, and let the two lines of this left hand figure intersect each other at their centres. Looking now at these two drawings in the stereoscope, the two strong lines, each seen by a different eye, will coincide, and the resultant perspective line will appear to occupy the same place as before; but the faint line which now falls on a line of the left retina, which corresponds with the line of the right retina on which one of the coinciding strong lines, viz. the vertical one, falls, appears in a different place. The place this faint line apparently occupies is the intersection of that plane of visual direction of the left eye in which it is situated, with the plane of visual direction of the right eye, which contains the strong vertical line.
This experiment affords another proof that there is no necessary physiological connection between the corresponding points of the two retinae, a doctrine which has been maintained by so many authors.
§ 13. Binocular Vision of Images of different Magnitudes. We will now inquire what effect results from presenting similar images, differing only in magnitude, to analogous parts of the two retinae. For this purpose two squares or circles, differing obviously but not extravagantly in size, may be drawn on two separate pieces of paper, and placed in the stereoscope so that the reflected image of each shall be equally distant from the eye by which it is regarded. It will then be seen that, notwithstanding this difference, they coalesce and occasion a single resultant perception. The limit of the difference of size within which the single appearance subsists may be ascertained by employing two images of equal magnitude, and causing one of them to recede from the eye while the other remains at a constant distance; this is effected merely by pulling out the sliding board C (fig. 8.) while the other C' remains fixed, the screw having previously been removed. Though the single appearance of two images of different size is by this experiment demonstrated, the observer is unable to perceive what difference exists between the apparent magnitude of the binocular image and that of the two monocular images; to determine this point the stereoscope must be dispensed with, and the experiment so arranged that all three shall be simultaneously seen; which may be done in the following manner:-The two drawings being placed side by side on a plane before the eyes, the optic axes must be made to converge to a nearer point as at fig. 4., or to a more distant one as at fig. 3., until the three images are seen at the same time, the binocular image in the middle, and the monocular images at each side. It will thus be seen that the binocular image is apparently intermediate in size between the two monocular ones. If the pictures be too unequal in magnitude, the binocular coincidence does not
take place. It appears that if the inequality of the pictures be greater than the difference which exists between the two projections of the same object when seen in the most oblique position of the eyes (i. e. both turned to the extreme right or to the extreme left) ordinarily employed, they do not coalesce. Were it not for the binocular coincidence of two images of different magnitude, objects would appear single only when the optic axes converge immediately forwards; for it is only when the converging visual lines form equal angles with the visual base (the line joining the centres of the two eyes) as at fig. 2., that the two pictures can be of equal magnitude; but when they form different angles with it, as at fig. 24., the distance from the object to each eye is different, and consequently the picture projected on each retina has a different magnitude. If a piece of money be held in the position a, (fig. 24.) while the optic axes converge to a nearer point c, it will appear double, and that seen by the left eye will be evidently smaller than the other.
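A short worked example (my own; the distances are assumed for illustration) shows how large this inequality can become in oblique viewing. The angular magnitude of an object of breadth s at distance d from an eye is nearly s/d. If, as in fig. 24, the object lies much nearer to one eye than to the other, say 20 cm from the left eye and 26 cm from the right with the eyes 6.5 cm apart, the two retinal images differ in magnitude in the ratio

\[
\frac{d_R}{d_L} = \frac{26}{20} = 1.3,
\]

and yet such pictures can still coalesce; without this tolerance, objects would appear single only when the visual lines made equal angles with the visual base.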
§ 14. Phenomena which are observed when objects of different forms are simultaneously presented to corresponding parts of the two retinae.
If we regard a picture with the right eye alone for a considerable length of time it will be constantly perceived; if we look at another and dissimilar picture with the left eye alone its effect will be equally permanent; it might therefore be expected, that if each of these pictures were presented to its corresponding eye at the same time the two would appear permanently superposed on each other. This, however, contrary to expectation, is not the case.
If a and b (fig. 25.) are each presented at the same time to a different eye, the common border will remain constant, while the letter within it will change alternately from that which would be perceived by the right eye alone to that which would be perceived by the left eye alone. At the moment of change the letter which has just been seen breaks into fragments, while fragments of the letter which is about to appear mingle with them, and are immediately after replaced by the entire letter. It does not appear to be in the power of the will to determine the appearance of either of the letters, but the duration of the appearance seems to depend on causes which are under our control: thus if the two pictures be equally illuminated, the alternations appear in general of equal duration; but if one picture be more illuminated than the other, that which is less so will be perceived during a shorter time. I have generally made this experiment with the apparatus, fig. 6. When complex pictures are employed in the stereoscope, various parts of them alternate differently.
There are some facts intimately connected with the subject of the present article which have already been frequently observed. I allude to the experiments, first made by DU TOUR, in which two different colours are presented to corresponding parts of the two retinae. If a blue disc be presented to the right eye and a yellow disc to the corresponding part of the left eye, instead of a green disc which would appear if these
two colours had mingled before their arrival at a single eye, the mind will perceive the two colours distinctly, one or the other alternately predominating either partially or wholly over the disc. In the same manner the mind perceives no trace of violet when red is presented to one eye and blue to the other, nor any vestige of orange when red and yellow are separately presented in a similar manner. These experiments may be conveniently repeated by placing the coloured discs in the stereoscope, but they have been most usually made by looking at a white object through differently coloured glasses, one applied to each eye.
In some authors we find it stated, contrary to fact, that if similar objects of different colour be presented one to each eye, the appearance will be that compounded of the two colours. Dr. REID* and JANIN are among the writers who have fallen into this inconsiderate error, which arose no doubt from their deciding according to previous notions, instead of ascertaining by experiment what actually does happen.
§ 15.
No question relating to vision has been so much debated as the cause of the single appearance of objects seen by both eyes. I shall in the present section give a slight review of the various theories which have been advanced by philosophers to account for this phenomenon, in order that the remarks I have to make in the succeeding section may be properly understood.
The law of visible direction for monocular vision has been variously stated by different optical writers. Some have maintained with Drs. REID and PORTERFIELD, that every external point is seen in the direction of a line passing from its picture on the retina through the centre of the eye; while others have supposed with Dr. SMITH that the visible direction of an object coincides with the visual ray, or the principal ray of the pencil which flows from it to the eye. D'ALEMBERT, furnished with imperfect data respecting the refractive densities of the humours of the eye, calculated that the apparent magnitudes of objects would differ widely on the two suppositions, and concluded that the visible point of an object was not seen in either of these directions, but sensibly in the direction of a line joining the point itself and its image on the retina; but he acknowledged that he could assign no reason for this law. Sir DAVID BREWSTER, provided with more accurate data, has shown that these three lines so nearly coincide with each other, that "at an inclination of 30°, a line perpendicular to the point of impression on the retina passes through the common centre, and does not deviate from the real line of visible direction more than half a degree, a quantity too small to interfere with the purposes of vision." We may, therefore, assume in all our future reasonings the truth of the following definition given by this eminent philosopher:-"As the interior eye-ball is as nearly as possible a perfect sphere, lines perpendicular to the surface of the retina must all pass through one single point,
* Enquiry, Sect. xiii.
namely the centre of its spherical surface. This one point may be called the centre of visible direction, because every point of a visible object will be seen in the direction of a line drawn from this centre to the visible point."
It is obvious, that the result of any attempt to explain the single appearance of objects to both eyes, or, in other words, the law of visible direction for binocular vision, ought to contain nothing inconsistent with the law of visible direction for monocular vision.
It was the opinion of AGUILONIUS, that all objects seen at the same glance with both eyes appear to be in the plane of the horopter. The horopter he defines to be a line drawn through the point of intersection of the optic axes, and parallel to the line joining the centres of the two eyes; the plane of the horopter to be a plane passing through this line at right angles to that of the optic axes. All objects which are in this plane must, according to him, appear single, because the lines of direction in which any point of an object is seen coincide only in this plane and nowhere else; and as these lines can meet each other only in one point, it follows from the hypothesis, that all objects not in the plane of the horopter must appear double, because their lines of direction intersect each other, either before or after they pass through it. This opinion was also maintained by DECHALES and PORTERFIELD. That it is erroneous, I have given, I think, sufficient proof, in showing that, when the optic axes converge to any point, objects before or beyond the plane of the horopter are under certain circumstances equally seen single as those in that plane.
Dr. WELLS'S "new theory of visible direction" was a modification of the preceding hypothesis. This acute writer held with AGUILONIUS, that objects are seen single only when they are in the plane of the horopter, and consequently that they appear double when they are either before or beyond it; but he attempted to make this single appearance of objects only in the plane of the horopter to depend on other principles, from which he deduced, contrary to AGUILONIUS, that the objects which are doubled do not appear in the plane of the horopter, but in other places which are determined by these principles. Dr. WELLS was led to his new theory by a fact which he accidentally observed, and which he could not reconcile with any existing theory of visible direction; this fact had, though he was unaware of it, been previously noticed by Dr. SMITH; it is already mentioned in § 8., and is the only instance of binocular vision of relief which I have found recorded previous to my own investigations. So little does Dr. WELLS'S theory appear to have been understood, that no subsequent writer has attempted either to confirm or disprove his opinions. It would be useless here to discuss the principles of this theory, which was framed to account for an anomalous individual fact, since it is inconsistent with the general rules on which that fact has been now shown to depend. Notwithstanding these erroneous views, the "Essay upon Single Vision with two Eyes" contains many valuable experiments and remarks, the truth of which is independent of the theory they were intended to illustrate.
The theory which has obtained greatest currency is that which assumes that an object is seen single because its pictures fall on corresponding points of the two retinae, that is on points which are similarly situated with respect to the two centres both in distance and position. This theory supposes that the pictures projected on the retinae are exactly similar to each other, corresponding points of the two pictures falling on corresponding points of the two retinae. Authors who agree with regard to this property, differ widely in explaining why objects are seen in the same place, or single, according to this law. Dr. SMITH makes it to depend entirely on custom, and explains why the eyes are habitually directed towards an object so that its pictures fall on corresponding parts in the following manner:-"When we view an object steadily, we have acquired a habit of directing the optic axes to the point in view; because its pictures falling upon the middle points of the retinas, are then distincter than if they fell upon any other places; and since the pictures of the whole object are equal to one another, and are both inverted with respect to the optic axes, it follows that the pictures of any collateral point are painted upon corresponding points of the retinas."
Dr. REID, after a long dissertation on the subject, concludes, " that by an original property of human eyes, objects painted upon the centres of the two retinae, or upon points similarly situated with regard to the centres, appear in the same visible place; that the most plausible attempts to account for this property of the eyes have been unsuccessful; and therefore, that it must be either a primary law of our constitution, or the consequence of some more general law which is not yet discovered."
Other writers who have admitted this principle have regarded it as arising from anatomical structure and dependent on connexion of nervous fibres; among these stand the names of GALEN, Dr. BRIGGS, Sir ISAAC NEWTON, ROHAULT, Dr. HARTLEY, Dr. WOLLASTON and Professor MÜLLER.
Many of the supporters of the theory of corresponding points have thought, or rather have admitted, without thinking, that it was not inconsistent with the law of AGUILONIUS; but very little reflection will show that both cannot be maintained together; for corresponding lines of visible direction, that is, lines terminating in corresponding points of the two retinae, cannot meet in the plane of the horopter unless the optic axes be parallel, and the plane be at an infinite distance before the eyes. Some of the modern German writers* have inquired what is the curve in which objects appear single while the optic axes are directed to a given point, on the hypothesis that objects are seen single only when they fall on corresponding points of the two retinae. An elegant proposition has resulted from their investigations, which I shall need no apology for introducing in this place, since it has not yet been mentioned in any English work.
R and L (fig. 26.) are the two eyes; C A, C' A the optic axes converging to the
* Tortual, Die Sinne des Menschen. Münster, 1827. Bartels, Beiträge zur Physiologie des Gesichtssinnes. Berlin, 1834.
point A; and C A B C' is a circle drawn through the point of convergence A and the centres of visible direction C C'. If any point be taken in the circumference of this circle, and lines be drawn from it through the centres of the two eyes C C', these lines will fall on corresponding points of the two retinae D D'; for the angles A C B, A C' B being equal, the angles D C E, D C' E are also equal; therefore any point placed in the circumference of the circle C A B C' will, according to the hypothesis, appear single while the optic axes are directed to A, or any other part in it.
I will mention two other properties of this binocular circle: 1st. The arc subtended by two points on its circumference contains double the number of degrees of the arc subtended by the pictures of these points on either retina, so that objects which occupy 180° of the supposed circle of single vision are painted on a portion of the retina extended over 90° only; for the angle D C E or D C' E being at the centre, and the angle B C A or B C' A at the circumference of a circle, this consequence follows. 2ndly. To whatever point of the circumference of the circle the optic axes be made to converge, they will form the same angle with each other; for the angles C A C', C B C' are equal.
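Both properties are cases of the inscribed-angle theorem; a compact restatement (added here for clarity, using the lettering of fig. 26) is as follows. The visual angle at either eye is an inscribed angle of the binocular circle and at the same time a central angle of that eye's retinal sphere, so that

\[
\text{arc}\,AB \ \text{(binocular circle)} = 2\,\angle ACB = 2\,\angle DCE = 2 \times \text{arc}\,DE \ \text{(retina)},
\]

which gives the ratio of 180° to 90° quoted above; and for any point P of the circle the convergence of the optic axes is the inscribed angle \angle CPC' standing on the fixed chord C C', so that

\[
\angle CAC' = \angle CBC' ,
\]

the same for every point of convergence on the circle.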
In the eye itself, the centre of visible direction, or the point at which the principal rays cross each other, is, according to Dr. YOUNG and other eminent optical writers, at the same time the centre of the spherical surface of the retina, and that of the lesser spherical surface of the cornea; in the diagram (fig. 26.), to simplify the consideration of the problem, R and L represent only the circle of curvature of the bottom of the retina, but the reasoning is equally true in both cases.
The same reasons, founded on the experiments in this memoir, which disprove the theory of AGUILONIUS, induce me to reject the law of corresponding points as an accurate expression of the phenomena of single vision. According to the former, objects can appear single only in the plane of the horopter; according to the latter, only when they are in the circle of single vision; both positions are inconsistent with the binocular vision of objects in relief, the points of which they consist appearing single though they are at different distances before the eyes. I have already proved that the assumption made by all the maintainers of the theory of corresponding points, namely that the two pictures projected by any object on the retinae are exactly similar, is quite contrary to fact in every case except that in which the optic axes are parallel.
GASSENDUS, PORTA, TACQUET and GALL maintained, that we see with only one eye at a time though both remain open, one according to them being relaxed and inattentive to objects while the other is upon the stretch. It is a sufficient refutation of this hypothesis, that we see an object double when one of the optic axes is displaced either by squinting or by pressure on the eye-ball with the finger; if we saw with only one eye, one object only should under such circumstances be seen. Again, in many cases which I have already explained, the simultaneous affection of the two retinae excites a different idea in the mind to that consequent on either of the single impressions, the latter giving rise to the idea of a representation on a plane surface,
the former to that of an object in relief; these things could not occur did we see with only one eye at a time.
DU TOUR* held that though we might occasionally see at the same time with both eyes, yet the mind cannot be affected simultaneously by two corresponding points of the two images. He was led to this opinion by the curious facts alluded to in § 14. It would be difficult to disprove this conjecture by experiment; but all that the experiments adduced in its favour, and others relating to the disappearance of objects to one eye, really prove is, that the mind is inattentive to impressions made on one retina when it cannot combine the impressions on the two retinae together so as to resemble the perception of some external objects; but they afford no ground whatever for supposing that the mind cannot under any circumstances attend to impressions made simultaneously on points of the two retinae, when they harmonize with each other in suggesting to the mind the same idea.
A perfectly original theory has been recently advanced by M. LEHOT†, who has endeavoured to prove, that instead of pictures on the retinae, images of three dimensions are formed in the vitreous humour which we perceive by means of nervous filaments extended thence from the retina. This theory would account for the single appearance to both eyes of objects in relief, but it would be quite insufficient to explain why we perceive an object of three dimensions when two pictures of it are presented to the eyes; according to it, also, no difference should be perceived in the relief of objects when seen by one or both eyes, which is contrary to what really happens. The proofs, besides, that we perceive external objects by means of pictures on the retinae are so numerous and convincing, that a contrary conjecture cannot be entertained for a moment. On this account it will suffice merely to mention two other theories which place the seat of vision in the vitreous humour. VALLÉE‡, without denying the existence of pictures on the retina, has advocated that we see the relief of objects by means of anterior foci on the hyaloid membrane; and RASPAIL§ developed at considerable length the strange hypothesis, that images are neither formed in the vitreous humour nor painted on the retina, but are immediately perceived at the focus of the lenticular system of which the eye is formed.
§ 16.
It now remains to examine why two dissimilar pictures projected on the two retinae give rise to the perception of an object in relief. I will not attempt at present to give the complete solution of this question, which is far from being so easy as at a first glance it may appear to be, and is indeed one of great complexity. I shall in this place merely consider the most obvious explanations which might be offered, and show their insufficiency to explain the whole of the phenomena.
* Act. Par. 1743. M. p. 334. † Nouvelle Théorie de la Vision, Par. 1823.
‡ Traité de la Science du Dessein, Par. 1821, p. 270. § Nouveau Système de Chimie Organique, t. 2. p. 329.
It may be supposed, that we see but one point of a field of view distinctly at the same instant, the one namely to which the optic axes are directed, while all other points are seen so indistinctly, that the mind does not recognize them to be either single or double, and that the figure is appreciated by successively directing the point of convergence of the optic axes to a sufficient number of its points to enable us to judge accurately of its form.
That there is a degree of indistinctness in those parts of the field of view to which the eyes are not immediately directed, and which increases with the distance from that point, cannot be doubted, and it is also true that the objects thus obscurely seen are frequently doubled. In ordinary vision, it may be said, this indistinctness and duplicity is not attended to, because the eyes shifting continually from point to point, every part of the object is successively rendered distinct; and the perception of the object is not the consequence of a single glance, during which only a small part of it is seen distinctly; but is formed from a comparison of all the pictures successively seen while the eyes were changing from one point of the object to another.
All this is in some degree true; but were it entirely so, no appearance of relief should present itself when the eyes remain intently fixed on one point of a binocular image in the stereoscope. But on performing the experiment carefully, it will be found, provided the pictures do not extend too far beyond the centres of distinct vision, that the image is still seen single and in relief when this condition is fulfilled. Were the theory of corresponding points true, the appearance should be that of the superposition of the two drawings, to which however it has not the slightest similitude. The following experiments are equally decisive against this theory.
Exp. 1. Draw two lines about two inches long and inclined towards each other, as in fig. 10., on a sheet of paper, and having caused them to coincide by converging the optic axes to a point nearer than the paper, look intently on the upper end of the resultant line, without allowing the eyes to wander from it for a moment. The entire line will appear single and in its proper relief, and a pin or a piece of straight wire may without the least difficulty be made to coincide exactly in position with it; or, if while the optic axes continue to be directed to the upper and nearer end, the point of a pin be made to coincide with the lower and further end or with any intermediate point of the resultant line, the coincidence will remain exactly the same when the optic axes are moved and meet there. The eyes sometimes become fatigued, which causes the line to appear double at those parts to which the optic axes are not fixed, but in such case all appearance of relief vanishes. The same experiment may be tried with more complex figures, but the pictures should not extend too far beyond the centres of the retina.
Another and a beautiful proof that the appearance of relief in binocular vision is an effect independent of the motions of the eyes, may be obtained by impressing on the retinae ocular spectra of the component figures. For this purpose the drawings should be formed of broad coloured lines on a ground of the complementary colour,
for instance red lines on a green ground, and be viewed either in the stereoscope or in the apparatus, fig. 6., as the ordinary figures are, taking care however to fix the eyes only to a single point of the compound figure; the drawings must be strongly illuminated, and after a sufficient time has elapsed to impress the spectra on the retinae, the eyes must be carefully covered to exclude all external light. A spectrum of the object in relief will then appear before the closed eyes. It is well known, that a spectrum impressed on a single eye and seen in the dark, frequently alternately appears and disappears: these alternations do not correspond in the spectra impressed on the two retinae, and hence a curious effect arises; sometimes the right eye spectrum will be seen alone, sometimes that of the left eye, and at those moments when the two appear together, the binocular spectrum will present itself in bold relief. As in this case the pictures cannot shift their places on the retinae in whatever manner the eyes be moved about, the optic axes can during the experiment only correspond with a single point of each.
When an object, or a part of an object, thus appears in relief while the optic axes are directed to a single binocular point, it is easy to see that each point of the figure that appears single is seen at the intersection of the two lines of visible direction in which it is seen by each eye separately, whether these lines of visible direction terminate at corresponding points of the two retinae or not.
But if we were to infer the converse of this, viz. that every point of an object in relief is seen by a single glance at the intersection of the lines of visible direction in which it is seen by each eye singly, we should be in error. On this supposition, objects before or beyond the intersection of the optic axes should never appear double, and we have abundant evidence that they do. The determination of the points which shall appear single seems to depend in no small degree on previous knowledge of the form we are regarding. No doubt, some law or rule of vision may be discovered which shall include all the circumstances under which single vision by means of non-corresponding points occurs and is limited. I have made numerous experiments for the purpose of attaining this end, and have ascertained some of the conditions on which single and double vision depend, the consideration of which however must at present be deferred.
Sufficient, however, has been shown to prove that the laws of binocular visible direction hitherto laid down are too restricted to be true. The law of AGUILONIUS assumes that objects in the plane of the horopter are alone seen single; and the law of corresponding points carried to its necessary consequences, though these consequences were unforeseen by its first advocates, many of whom thought that it was consistent with the law of AGUILONIUS, leads to the conclusion, that no object appears single unless it is seen in a circle passing through the centres of visible direction in each eye and the point of convergence of the optic axes. Both of these are inconsistent with the single vision of objects whose points lie out of the plane in one case and the circle in the other; and that objects do appear single under circumstances
that cannot be explained by these laws, has, I think, been placed beyond doubt by the experiments I have brought forward. Should it be hereafter proved, that all points in the plane or in the circle above mentioned are seen single, and from the great indistinctness of lateral images it will be difficult to give this proof, the law must be qualified by the admission, that points out of them do not always appear double.

View File

@@ -0,0 +1,13 @@
Title: Contributions to the Physiology of Vision.--Part the First. On Some Remarkable, and Hitherto Unobserved, Phenomena of Binocular Vision
Creator: page2pdf-2.1
Producer: iText® 5.5.8 ©2000-2015 iText Group NV (AGPL-version); modified using iText® 7.1.3 ©2000-2018 iText Group NV (JSTOR Michigan; licensed version)
CreationDate: 08/08/16 22:36:32
ModDate: 09/12/20 06:05:41
Tagged: yes
Form: none
Pages: 27
Encrypted: no
Page size: 595 x 882 pts (rotated 0 degrees)
File size: 5988306 bytes
Optimized: no
PDF version: 1.7

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
Creator: TeX
Producer: pdfTeX-1.40.10
CreationDate: 09/13/10 12:49:13
ModDate: 09/13/10 16:57:18
Tagged: no
Form: none
Pages: 12
Encrypted: no
Page size: 612 x 792 pts (letter) (rotated 0 degrees)
File size: 365741 bytes
Optimized: no
PDF version: 1.4

View File

@@ -0,0 +1,221 @@
EU Artificial Intelligence Act
High-level summary of the AI Act
27 Feb, 2024
Updated on 30 May in accordance with the Corrigendum version of the AI Act.
In this article we provide you with a high-level summary of the AI Act, selecting the parts which are most likely to be relevant to you regardless of who you are. We provide links to the original document where relevant so that you can always reference the Act text.
To explore the full text of the AI Act yourself, use our AI Act Explorer. Alternatively, if you want to know which parts of the text are most relevant to you, use our Compliance Checker.
Four-point summary
The AI Act classifies AI according to its risk:
Unacceptable risk is prohibited (e.g. social scoring systems and manipulative AI).
Most of the text addresses high-risk AI systems, which are regulated.
A smaller section handles limited risk AI systems, subject to lighter transparency obligations: developers and deployers must ensure that end-users are aware that they are interacting with AI (chatbots and deepfakes).
Minimal risk is unregulated (including the majority of AI applications currently available on the EU single market, such as AI-enabled video games and spam filters, at least as of 2021; this is changing with generative AI).
The majority of obligations fall on providers (developers) of high-risk AI systems.
Those that intend to place on the market or put into service high-risk AI systems in the EU, regardless of whether they are based in the EU or a third country.
And also third country providers where the high risk AI system's output is used in the EU.
Users are natural or legal persons that deploy an AI system in a professional capacity, not affected end-users.
Users (deployers) of high-risk AI systems have some obligations, though less than providers (developers).
This applies to users located in the EU, and third country users where the AI system's output is used in the EU.
General purpose AI (GPAI):
All GPAI model providers must provide technical documentation, instructions for use, comply with the Copyright Directive, and publish a summary about the content used for training.
Free and open licence GPAI model providers only need to comply with copyright and publish the training data summary, unless they present a systemic risk.
All providers of GPAI models that present a systemic risk, whether open or closed, must also conduct model evaluations, adversarial testing, track and report serious incidents and ensure cybersecurity protections.
Prohibited AI systems ( Chapter II , Art. 5 )
The following types of AI system are Prohibited according to the AI Act.
AI systems:
deploying subliminal, manipulative, or deceptive techniques to distort behaviour and impair informed decision-making, causing significant harm.
exploiting vulnerabilities related to age, disability, or socio-economic circumstances to distort behaviour, causing significant harm.
biometric categorisation systems inferring sensitive attributes (race, political opinions, trade union membership, religious or philosophical beliefs, sex life, or sexual orientation), except labelling or filtering of lawfully acquired biometric datasets or when law enforcement categorises biometric data.
social scoring , i.e., evaluating or classifying individuals or groups based on social behaviour or personal traits, causing detrimental or unfavourable treatment of those people.
assessing the risk of an individual committing criminal offenses solely based on profiling or personality traits, except when used to augment human assessments based on objective, verifiable facts directly linked to criminal activity.
compiling facial recognition databases by untargeted scraping of facial images from the internet or CCTV footage.
inferring emotions in workplaces or educational institutions , except for medical or safety reasons.
real-time remote biometric identification (RBI) in publicly accessible spaces for law enforcement , except when:
searching for missing persons, abduction victims, and people who have been human trafficked or sexually exploited;
preventing substantial and imminent threat to life, or foreseeable terrorist attack; or
identifying suspects in serious crimes (e.g., murder, rape, armed robbery, narcotic and illegal weapons trafficking, organised crime, and environmental crime, etc.).
Notes on remote biometric identification:
Using AI-enabled real-time RBI is only allowed when not using the tool would cause considerable harm and must account for affected persons' rights and freedoms.
Before deployment, police must complete a fundamental rights impact assessment and register the system in the EU database , though, in duly justified cases of urgency, deployment can commence without registration, provided that it is registered later without undue delay.
Before deployment, they also must obtain authorisation from a judicial authority or independent administrative authority [ 1 ], though, in duly justified cases of urgency, deployment can commence without authorisation, provided that authorisation is requested within 24 hours. If authorisation is rejected, deployment must cease immediately, deleting all data, results, and outputs.
[1] Independent administrative authorities may be subject to greater political influence than judicial authorities (Hacker, 2024).
High risk AI systems ( Chapter III )
Some AI systems are considered High risk under the AI Act. Providers of those systems will be subject to additional requirements.
Classification rules for high-risk AI systems ( Art. 6 )
High risk AI systems are those:
used as a safety component or a product covered by EU laws in Annex I AND required to undergo a third-party conformity assessment under those Annex I laws; OR
those under Annex III use cases (below), except if:
the AI system performs a narrow procedural task;
improves the result of a previously completed human activity;
detects decision-making patterns or deviations from prior decision-making patterns and is not meant to replace or influence the previously completed human assessment without proper human review; or
performs a preparatory task to an assessment relevant for the purpose of the use cases listed in Annex III.
AI systems are always considered high-risk if they profile individuals, i.e. automated processing of personal data to assess various aspects of a person's life, such as work performance, economic situation, health, preferences, interests, reliability, behaviour, location or movement.
Providers whose AI system falls under the use cases in Annex III but who believe it is not high-risk must document such an assessment before placing it on the market or putting it into service.
Requirements for providers of high-risk AI systems (Art. 8-17)
High risk AI providers must:
Establish a risk management system throughout the high risk AI system's lifecycle;
Conduct data governance , ensuring that training, validation and testing datasets are relevant, sufficiently representative and, to the best extent possible, free of errors and complete according to the intended purpose.
Draw up technical documentation to demonstrate compliance and provide authorities with the information to assess that compliance.
Design their high risk AI system for record-keeping to enable it to automatically record events relevant for identifying national level risks and substantial modifications throughout the system's lifecycle.
Provide instructions for use to downstream deployers to enable the latter's compliance.
Design their high risk AI system to allow deployers to implement human oversight .
Design their high risk AI system to achieve appropriate levels of accuracy, robustness, and cybersecurity .
Establish a quality management system to ensure compliance.
Annex III use cases
Non-banned biometrics: Remote biometric identification systems, excluding biometric verification that confirm a person is who they claim to be. Biometric categorisation systems inferring sensitive or protected attributes or characteristics. Emotion recognition systems.
Critical infrastructure: Safety components in the management and operation of critical digital infrastructure, road traffic and the supply of water, gas, heating and electricity.
Education and vocational training: AI systems determining access, admission or assignment to educational and vocational training institutions at all levels. Evaluating learning outcomes, including those used to steer the student's learning process. Assessing the appropriate level of education for an individual. Monitoring and detecting prohibited student behaviour during tests.
Employment, workers management and access to self-employment: AI systems used for recruitment or selection, particularly targeted job ads, analysing and filtering applications, and evaluating candidates. Promotion and termination of contracts, allocating tasks based on personality traits or characteristics and behaviour, and monitoring and evaluating performance.
Access to and enjoyment of essential public and private services: AI systems used by public authorities for assessing eligibility to benefits and services, including their allocation, reduction, revocation, or recovery. Evaluating creditworthiness, except when detecting financial fraud. Evaluating and classifying emergency calls, including dispatch prioritising of police, firefighters, medical aid and urgent patient triage services. Risk assessments and pricing in health and life insurance.
Law enforcement: AI systems used to assess an individual's risk of becoming a crime victim. Polygraphs. Evaluating evidence reliability during criminal investigations or prosecutions. Assessing an individual's risk of offending or re-offending not solely based on profiling or assessing personality traits or past criminal behaviour. Profiling during criminal detections, investigations or prosecutions.
Migration, asylum and border control management: Polygraphs. Assessments of irregular migration or health risks. Examination of applications for asylum, visa and residence permits, and associated complaints related to eligibility. Detecting, recognising or identifying individuals, except verifying travel documents.
Administration of justice and democratic processes: AI systems used in researching and interpreting facts and applying the law to concrete facts or used in alternative dispute resolution. Influencing elections and referenda outcomes or voting behaviour, excluding outputs that do not directly interact with people, like tools used to organise, optimise and structure political campaigns.
General purpose AI (GPAI)
GPAI model means an AI model, including when trained with a large amount of data using self-supervision at scale, that displays significant generality and is capable of competently performing a wide range of distinct tasks regardless of the way the model is placed on the market, and that can be integrated into a variety of downstream systems or applications. This does not cover AI models that are used before release on the market for research, development and prototyping activities.
GPAI system means an AI system which is based on a general purpose AI model, that has the capability to serve a variety of purposes, both for direct use as well as for integration in other AI systems.
GPAI systems may be used as high risk AI systems or integrated into them. GPAI system providers should cooperate with such high risk AI system providers to enable the latter's compliance.
All providers of GPAI models must:
Draw up technical documentation , including training and testing process and evaluation results.
Draw up information and documentation to supply to downstream providers that intend to integrate the GPAI model into their own AI system in order that the latter understands capabilities and limitations and is enabled to comply.
Establish a policy to respect the Copyright Directive .
Publish a sufficiently detailed summary about the content used for training the GPAI model.
Free and open licence GPAI models whose parameters, including weights, model architecture and model usage are publicly available, allowing for access, usage, modification and distribution of the model only have to comply with the latter two obligations above, unless the free and open licence GPAI model is systemic.
GPAI models present systemic risks when the cumulative amount of compute used for their training is greater than 10²⁵ floating point operations (FLOPs). Providers must notify the Commission within two weeks if their model meets this criterion. The provider may present arguments that, despite meeting the criteria, their model does not present systemic risks. The Commission may decide on its own, or via a qualified alert from the scientific panel of independent experts, that a model has high impact capabilities, rendering it systemic.
In addition to the four obligations above, providers of GPAI models with systemic risk must also:
Perform model evaluations , including conducting and documenting adversarial testing to identify and mitigate systemic risk.
Assess and mitigate possible systemic risks , including their sources.
Track, document and report serious incidents and possible corrective measures to the AI Office and relevant national competent authorities without undue delay.
Ensure an adequate level of cybersecurity protection .
All GPAI model providers may demonstrate compliance with their obligations if they voluntarily adhere to a code of practice until European harmonised standards are published, compliance with which will lead to a presumption of conformity. Providers that don't adhere to codes of practice must demonstrate alternative adequate means of compliance for Commission approval.
Codes of practice
Will account for international approaches.
Will cover, but are not necessarily limited to, the above obligations, particularly the relevant information to include in technical documentation for authorities and downstream providers, identification of the type and nature of systemic risks and their sources, and the modalities of risk management, accounting for specific challenges in addressing risks due to the way they may emerge and materialise throughout the value chain.
The AI Office may invite GPAI model providers and relevant national competent authorities to participate in drawing up the codes, while civil society, industry, academia, downstream providers and independent experts may support the process.
Governance
How will the AI Act be implemented?
The AI Office will be established, sitting within the Commission, to monitor the effective implementation and compliance of GPAI model providers.
Downstream providers can lodge a complaint regarding an upstream provider's infringement to the AI Office.
The AI Office may conduct evaluations of the GPAI model to:
assess compliance where the information gathered under its powers to request information is insufficient;
investigate systemic risks, particularly following a qualified report from the scientific panel of independent experts.
Timelines
After entry into force, the AI Act will apply by the following deadlines:
6 months for prohibited AI systems.
12 months for GPAI.
24 months for high risk AI systems under Annex III.
36 months for high risk AI systems under Annex I.
Codes of practice must be ready 9 months after entry into force.
See our full implementation timeline for all key milestones relating to the implementation of the AI Act.

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,863 @@
Use of Ranks in One-Criterion Variance Analysis Author(s): William H. Kruskal and W. Allen Wallis Source: Journal of the American Statistical Association , Dec., 1952, Vol. 47, No. 260 (Dec., 1952), pp. 583-621 Published by: Taylor & Francis, Ltd. on behalf of the American Statistical Association Stable URL: https://www.jstor.org/stable/2280779
JOURNAL OF THE AMERICAN
STATISTICAL ASSOCIATION
Number 260 DECEMBER 1952 Volume 47
USE OF RANKS IN ONE-CRITERION VARIANCE
ANALYSIS
WILLIAM H. KRUSKAL AND W. ALLEN WALLIS
University of Chicago
1. INTRODUCTION
   1.1 Problem
   1.2 Usual Solution
   1.3 Advantages of Ranks
   1.4 The H Test
2. EXAMPLES
   2.1 Without Ties
   2.2 With Ties
3. JUSTIFICATION OF THE METHOD
   3.1 Two Samples
       3.1.1 Continuity adjustment
       3.1.2 Ties
       3.1.3 Examples
   3.2 Three Samples
   3.3 More than Three Samples
4. INTERPRETATION OF THE TEST
   4.1 General Considerations
   4.2 Comparison of Means when Variability Differs
5. RELATED TESTS
   5.1 Permutation Tests and Ranks
   5.2 Friedman's χ²_r
   5.3 Wilcoxon's Two-Sample Test
       5.3.1 Wilcoxon (1945, 1947)
       5.3.2 Festinger (1946)
       5.3.3 Mann and Whitney (1947)
       5.3.4 Haldane and Smith (1948)
       5.3.5 White (1952)
       5.3.6 Power of Wilcoxon's test
   5.4 Whitney's Three-Sample Test
   5.5 Terpstra's C-Sample Test
   5.6 Mosteller's C-Sample Test
   5.7 Fisher and Yates' Normalized Ranks
   5.8 Other Related Tests
       5.8.1 Runs
       5.8.2 Order statistics
6. SIGNIFICANCE LEVELS, TRUE AND APPROXIMATE
   6.1 True Significance Levels
       6.1.1 Two samples
       6.1.2 Three samples
       6.1.3 More than three samples
   6.2 Approximate Significance Levels
       6.2.1 χ² approximation
       6.2.2 Γ approximation
       6.2.3 B approximation
   6.3 Comparisons of True and Approximate Significance Levels
7. REFERENCES
Given C samples, with n_i observations in the ith sample, a test of the hypothesis that the samples are from the same population may be made by ranking the observations from 1 to Σn_i (giving each observation in a group of ties the mean of the ranks tied for), finding the C sums of ranks, and computing a statistic H. Under the stated hypothesis, H is distributed approximately as χ²(C−1), unless the samples are too small, in which case special approximations or exact tables are provided. One of the most important applications of the test is in detecting differences among the population means.*
1. INTRODUCTION
1.1. Problem
A COMMON problem in practical statistics is to decide whether several samples should be regarded as coming from the same
population. Almost invariably the samples differ, and the question is whether the differences signify differences among the populations, or are merely the chance variations to be expected among random samples from the same population. When this problem arises one may often assume that the populations are of approximately the same form, in the sense that if they differ it is by a shift or translation.
1.2. Usual Solution
The usual technique for attacking such problems is the analysis of variance with a single criterion of classification [46, Chap. 10]. The variation among the sample means, x̄_i, is used to estimate the variation among individuals, on the basis of (i) the assumption that the variation among the means reflects only random sampling from a population in which individuals vary, and (ii) the fact that the variance of the
means of random samples of size n_i is σ²/n_i, where σ² is the population variance. This estimate of σ² based on the variation among sample means is then compared with another estimate based only on the varia-
* Based in part on research supported by the Office of Naval Research at the Statistical Research Center, University of Chicago.
For criticisms of a preliminary draft which have led to a number of improvements we are indebted to Maurice H. Belz (University of Melbourne), William G. Cochran (Johns Hopkins University), J. Durbin (London School of Economics), Churchill Eisenhart (Bureau of Standards), Wassily Hoeffding (University of North Carolina), Harold Hotelling (University of North Carolina), Howard L. Jones (Illinois Bell Telephone Company), Erich L. Lehmann (University of California), William G. Madow (University of Illinois), Henry B. Mann (Ohio State University), Alexander M. Mood (The Rand Corporation), Lincoln E. Moses (Stanford University), Frederick Mosteller (Harvard University), David L. Russell (Bowdoin College), I. Richard Savage (Bureau of Standards), Frederick F. Stephan (Princeton University), Alan Stuart (London School of Economics), T. J. Terpstra (Mathematical Center, Amsterdam), John W. Tukey (Princeton University), Frank Wilcoxon (American Cyanamid Company), and C. Ashley Wright (Standard Oil Company of New Jersey), and to our colleagues K. A. Brownlee, Herbert T. David, Milton Friedman, Leo A. Goodman, Ulf Grenander, Joseph L. Hodges, Harry V. Roberts, Murray Rosenblatt, Leonard J. Savage, and Charles M. Stein.
tion within samples. The agreement between these two estimates is tested by the variance ratio distribution with C -1 and N - C degrees of freedom (where N is the number of observations in all C samples combined), using the test statistic F(C- 1, N- C). A value of F larger than would ordinarily result from two independent sample estimates of a single population variance is regarded as contradicting the hypothesis that the variation among the sample means is due solely to random sampling from a population whose individuals vary.
When σ² is known, it is used in place of the estimate based on the variation within samples, and the test is based on the χ²(C−1) distribution (that is, χ² with C−1 degrees of freedom) using the test statistic

(1.1)   \chi^2(C-1) = \sum_{i=1}^{C} \frac{n_i(\bar x_i - \bar x)^2}{\sigma^2}

where x̄ is the mean of all N observations.
1.3. Advantages of Ranks
Sometimes it is advantageous in statistical analysis to use ranks instead of the original observations, that is, to array the N observations in order of magnitude and replace the smallest by 1, the next-to-smallest by 2, and so on, the largest being replaced by N. The advantages are:
(1) The calculations are simplified. Most of the labor when using ranks is in making the ranking itself, and short cuts can be devised for this. For example, class intervals can be set up as for a frequency distribution, and actual observations entered instead of tally marks. Another method is to record the observations on cards or plastic chips¹ which can be arranged in order, the cards perhaps by sorting devices.
(2) Only very general assumptions are made about the kind of distributions from which the observations come. The only assumptions underlying the use of ranks made in this paper are that the observations are all independent, that all those within a given sample come from a single population, and that the C populations are of approximately the same form. The F and χ² tests described in the preceding section assume approximate normality in addition.
(3) Data available only in ordinal form may often be used.
(4) When the assumptions of the usual test procedure are too far from reality, not only is there a problem of distribution theory if the usual test is used, but it is possible that the usual test may not have as good a chance as a rank test of detecting the kinds of difference of real interest.
The present paper presents an analog, based on ranks and called the H test, to one-criterion variance analysis.
¹ We are indebted to Frank Wilcoxon for this suggestion.
1.4. The H Test
The rank test presented here requires that all the observations be
ranked together, and the sum of the ranks obtained for each sample. The test statistic to be computed if there are no ties (that is, if no two observations are equal) is
(1.2)   H = \frac{12}{N(N+1)} \sum_{i=1}^{C} \frac{R_i^2}{n_i} - 3(N+1)   (no ties)

where C = the number of samples,
      n_i = the number of observations in the ith sample,
      N = Σn_i, the number of observations in all samples combined,
      R_i = the sum of the ranks in the ith sample.
Large values of H lead to rejection of the null hypothesis. If the samples come from identical continuous populations and the n_i are not too small, H is distributed as χ²(C−1), permitting use of the readily available tables of χ². When the n_i are small and C = 2, tables are available which are described in Section 5.3. For C = 3 and all n_i ≤ 5, tables are presented in Section 6. For other cases where the χ² approximation is not adequate, two special approximations are described in Section 6.2.
If there are ties, each observation is given the mean of the ranks for which it is tied. H as computed from (1.2) is then divided by

(1.3)   1 - \frac{\sum T}{N^3 - N}

where the summation is over all groups of ties and T = (t−1)t(t+1) = t³ − t for each group of ties, t being the number of tied observations in the group. Values of T for t up to 10 are shown in Table 1.1.²
TABLE 1.1 (See Section 3.1.2)

t:  1   2   3   4    5    6    7    8    9    10
T:  0   6   24  60   120  210  336  504  720  990
Since (1.3) must lie between zero and one, it increases (1.2). If all N observations are equal, (1.3) reduces (1.2) to the indeterminate form 0/0. If there are no ties, each value of t is 1, so ΣT = 0 and (1.2) is
² DuBois [4, Table I] gives values of T/12 (his C1) and T/6 (his c2) for t (his N) from 5 to 50.
unaltered by (1.3). Thus, (1.2) divided by (1.3) gives a general expression which holds whether or not there are ties, assuming that such ties as occur are given mean ranks:

(1.4)   H = \frac{\dfrac{12}{N(N+1)} \sum_{i=1}^{C} \dfrac{R_i^2}{n_i} - 3(N+1)}{1 - \sum T/(N^3 - N)}
In many situations the difference between (1.4) and (1.2) is negligible. A working guide is that with ten or fewer samples a χ² probability of 0.01 or more obtained from (1.2) will not be changed by more than ten per cent by using (1.4), provided that not more than one-fourth of the observations are involved in ties.³ H for large samples is still distributed as χ²(C−1) when ties are handled by mean ranks; but the tables for small samples, while still useful, are no longer exact.
For understanding the nature of H, a better formulation of (1.2) is

(1.5)   H = \frac{N-1}{N} \sum_{i=1}^{C} \frac{n_i\left[\bar R_i - \tfrac{1}{2}(N+1)\right]^2}{(N^2-1)/12}   (no ties)

where R̄_i is the mean of the n_i ranks in the ith sample. If we ignore the factor (N−1)/N, and note that ½(N+1) is the mean and (N²−1)/12 the variance of the uniform distribution over the first N integers, we see that (1.5), like (1.1), is essentially a sum of squared standardized deviations of random variables from their population mean. In this respect, H is similar to χ², which is defined as a sum of squares of standardized normal deviates, subject to certain conditions on the relations among the terms of the sum. If the n_i are not too small, the R̄_i jointly will be approximately normally distributed and the relations among them will meet the χ² conditions.
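The arithmetic in (1.2)-(1.4) is easy to mechanise. As a minimal illustrative sketch (not the authors' own code; the function names mean_ranks and kruskal_wallis_h are ours), the following Python pools the samples, assigns mean ranks to ties, applies (1.2), and divides by the correction factor (1.3):

from itertools import groupby


def mean_ranks(values):
    """Rank all values from 1..N, giving each member of a group of ties the mean of the ranks tied for."""
    order = sorted(range(len(values)), key=lambda i: values[i])
    ranks = [0.0] * len(values)
    pos = 0
    while pos < len(order):
        end = pos
        while end + 1 < len(order) and values[order[end + 1]] == values[order[pos]]:
            end += 1
        tied_mean = ((pos + 1) + (end + 1)) / 2.0      # ranks are the 1-based sorted positions
        for k in range(pos, end + 1):
            ranks[order[k]] = tied_mean
        pos = end + 1
    return ranks


def kruskal_wallis_h(samples):
    """H of (1.4): formula (1.2) computed on mean ranks, divided by the tie correction (1.3)."""
    data = [x for sample in samples for x in sample]
    sizes = [len(sample) for sample in samples]
    N = len(data)
    ranks = mean_ranks(data)

    # Rank sums R_i, taking the pooled ranks back sample by sample.
    rank_sums, start = [], 0
    for n in sizes:
        rank_sums.append(sum(ranks[start:start + n]))
        start += n

    h = 12.0 / (N * (N + 1)) * sum(R * R / n for R, n in zip(rank_sums, sizes)) - 3 * (N + 1)

    # Tie correction (1.3): T = t^3 - t for each group of t tied observations.
    T = sum(t ** 3 - t for t in (len(list(g)) for _, g in groupby(sorted(data))))
    return h / (1.0 - T / (N ** 3 - N))

With no ties the correction factor is 1 and the function returns (1.2) unchanged; if all N observations are equal the correction is 0 and H is indeterminate, as noted above.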
2. EXAMPLES
2.1 Without Ties
In a factory, three machines turn out large numbers of bottle caps. One machine is standard and two have been modified in different ways, but otherwise the machines and their operating conditions are identical. On any one day, only one machine is operated. Table 2.1
³ Actually, for the case described it is possible for the discrepancy slightly to exceed ten per cent. For a given total number of ties, S, the second term of (1.3) is a maximum if all S ties are in one group, and this maximum, (S³ − S)/(N³ − N), is slightly less than (S/N)³. Thus, for S/N = ¼, (1.3) > 63/64. The 0.01 level of χ²(9) is 21.666. This divided by 63/64 is 22.010, for which the probability is 0.00885, a change of 11½ per cent. For higher probability levels, fewer samples, or more than one group of ties the percentage change in probability would be less. With the S ties divided into G groups, the second term of (1.3) is always less than [(S − h)³ + 4h]/N³, where h = 2(G − 1).
shows the production of the machines on various days, together with the calculation of H as 5.656. The true probability, if the machines really are the same with respect to output, that H should be as large as 5.656 is shown in Figure 6.1 and Table 6.1 as 0.049. The approximation to this probability given by the χ²(2) distribution is 0.059. Two more complicated approximations described in Section 6.2 give 0.044 and 0.045.
TABLE 2.1
DAILY BOTTLE-CAP PRODUCTION OF THREE MACHINES. (Artificial data.)

        Standard          Modification 1      Modification 2
        Output   Rank     Output   Rank       Output   Rank
        340      5        339      4          347      10
        345      9        333      2          343      7
        330      1        344      8          349      11
        342      6                            355      12
        338      3
                                                                  Sum
n       5                 3                   4                   12
R       24                14                  40                  78
R²/n    115.2             65.333              400.                580.533

Checks: Σn = N = 12;  ΣR = ½N(N+1) = 78

H = \frac{12 \times 580.533}{12 \times 13} - 3 \times 13 = 5.656        from (1.2)
Pr[χ²(2) ≥ 5.656] = 0.059                                               from [9] or [13]
Pr[H(5, 4, 3) ≥ 5.656] = 0.049                                          from Table 6.1
If the production data of Table 2.1 are compared by the conventional analysis of variance, F(2, 9) is 4.2284, corresponding to a probability of 0.051.
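These figures are easy to check with a modern statistical library. Assuming SciPy is available, scipy.stats.kruskal reproduces H and its χ²-approximate probability (it does not give the exact 0.049 of Table 6.1), and scipy.stats.f_oneway gives the conventional analysis of variance; the F value computed from the tabled data comes out near 4.22, slightly different from the 4.2284 quoted above, presumably a rounding difference:

from scipy import stats

standard       = [340, 345, 330, 342, 338]
modification_1 = [339, 333, 344]
modification_2 = [347, 343, 349, 355]

h, p_chi2 = stats.kruskal(standard, modification_1, modification_2)
print(round(h, 3), round(p_chi2, 3))   # 5.656 0.059  (chi-square approximation)

f, p_f = stats.f_oneway(standard, modification_1, modification_2)
print(round(f, 2), round(p_f, 3))      # about 4.22 0.051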
2.2 With Ties
Snedecor's data on the birth weight of pigs [46, Table 10.12] are shown in Table 2.2, together with the calculation of H adjusted for
the mean-rank method of handling ties. Here H as adjusted⁴ is 18.566. The true probability in this case would be difficult to find, but the
⁴ Note that, as will often be true in practice, the adjustment is not worth the trouble even in this case: by changing H from 18.464 to 18.566, it changed the probability by only 0.0003, or 3 per cent. Since there are 47 ties in 13 groups, we see from the last sentence of note 3 that (1.3) cannot be less than 1 − (23³ + 96)/56³, which is 0.9302.
[TABLE 2.2. Birth weights of pigs [46, Table 10.12], ranked with mean ranks for ties, and the calculation of the adjusted H; the table is not legible in this extraction.]
χ²(7) approximation gives a probability of 0.010. The two more complicated approximations described in Section 6.2 give 0.006 and 0.005.
The conventional analysis of variance [46, Sec. 10.8] gives F(7, 48) = 2.987, corresponding to a probability of 0.011.
3. JUSTIFICATION OF THE METHOD
3.1. Two Samples
The rationale of the H test can be seen most easily by considering the case of only two samples, of sizes n and N−n. As is explained in Section 5.3, the H test for two samples is essentially the same as a test published by Wilcoxon [61] in 1945 and later by others.
In this case, we consider either one of the two samples, presumably the smaller for simplicity, and denote its size by n and its sum of ranks by R. We ask whether the mean rank of this sample is larger (or smaller) than would be expected if n of the integers 1 through N were selected at random without replacement.
The sum of the first N integers is ½N(N+1) and the sum of their squares is ⅙N(N+1)(2N+1). It follows that the mean and variance of
the first N integers are ½(N+1) and (N²−1)/12.
The means of samples of n drawn at random without replacement from the N integers will be normally distributed to an approximation close enough for practical purposes, provided that n and N-n are not too small. The mean of a distribution of sample means is, of course, the mean of the original distribution; and the variance of a distribution of sample means is (o-2/n)[(N-n)/(N-1)], where o2 is the population variance, N is the population size, and n is the sample size. In this case,
σ² = (N²−1)/12, so

(3.1)   \sigma_{\bar R}^2 = \frac{(N^2-1)(N-n)}{12n(N-1)} = \frac{(N+1)(N-n)}{12n}

where σ_R̄² represents the variance of the mean of n numbers drawn at random without replacement from N consecutive integers. Letting R̄ denote the mean rank for a sample of n,

(3.2)   \frac{\bar R - \tfrac{1}{2}(N+1)}{\sqrt{(N+1)(N-n)/12n}}

may be regarded as approximately a unit normal deviate. The square of (3.2) is H as given by (1.2) with⁵ C = 2, and the square of a unit normal deviate has the χ²(1) distribution.
⁵ This may be verified by replacing R̄ in (3.2) by R/n and letting the two values of R_i in (1.2) be R and ½N(N+1) − R, with n and N−n the corresponding values of n_i.
Notice that this expression is the same, except for sign, whichever of the two samples is used to compute it. For if the first sample contains n ranks whose mean is R̄, the other sample must contain N−n ranks whose mean is

(3.3)   \frac{\tfrac{1}{2}N(N+1) - n\bar R}{N-n}

and the value of (3.2) is changed only in sign if we interchange n and N−n, and replace R̄ by (3.3).
In the two-sample case the normal deviate is perhaps a little simpler to compute than is H; furthermore, the sign of the normal deviate is needed if a one-tail test is required. For computations, formula (3.2) may be rewritten

(3.4)   \frac{2R - n(N+1)}{\sqrt{n(N+1)(N-n)/3}}
The null hypothesis is that the two samples come from the same population. The alternative hypothesis is that the samples come from populations of approximately the same form, but shifted or translated with respect to each other. If we are concerned with the one-sided alternative that the population producing the sample to which R and n relate is shifted upward, then we reject when (3.4) is too large. The critical level of (3.4) at the α level of significance is approximately K_α, the unit normal deviate exceeded with probability α, as defined by

(3.5)   \frac{1}{\sqrt{2\pi}} \int_{K_\alpha}^{\infty} e^{-x^2/2}\, dx = \alpha.

Values of (3.4) as large as K_α or larger result in rejection of the null hypothesis. If the alternative is one-sided but for a downward shift, the null hypothesis is rejected when (3.4) is as small as −K_α or smaller. If the alternative is two-sided and symmetrical, the null hypothesis is rejected if (3.4) falls outside the range −K_{½α} to +K_{½α}.
3.1.1. Continuity adjustment. It seems reasonable to expect that a continuity adjustment may be desirable, to allow for the fact that R, the sum of the ranks in one sample, can take only integral values, whereas the normal distribution is continuous.6 In testing against a two-sided alternative to the null hypothesis, the adjustment is made
⁶ An extensive comparison of exact probabilities for the two-sample test [28] with those based on the normal approximation indicates that the normal approximation is usually better with the continuity adjustment when the probability is above 0.02, and better without it when the probability is 0.02 or below. This comparison was made for us by Jack Karush, who has also rendered invaluable assistance with numerous other matters in the preparation of this paper.
by increasing or decreasing R by ½, whichever brings it closer to ½n(N+1), before substituting into (3.4). (If R = ½n(N+1), ignore the continuity adjustment.) With a one-sided alternative, R is increased (decreased) by ½ if the alternative is that the sample for which R is computed comes from the population which is to the left (right) of the other.
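As a small illustration (ours, not the paper's; the name rank_sum_deviate is hypothetical), the deviate (3.4) together with the ±½ continuity adjustment just described can be written as:

from math import sqrt


def rank_sum_deviate(R, n, N, continuity=False):
    """Approximate unit normal deviate (3.4) for a sample of n ranks summing to R,
    out of N observations in all, optionally with the +/- 1/2 continuity adjustment."""
    expected = n * (N + 1) / 2
    if continuity and R != expected:
        R += 0.5 if R < expected else -0.5     # move R half a unit toward its null expectation
    return (2 * R - n * (N + 1)) / sqrt(n * (N + 1) * (N - n) / 3)

For the Pitman data of Section 3.1.3 below, rank_sum_deviate(12, 4, 9) gives -1.9596 and rank_sum_deviate(12, 4, 9, continuity=True) gives -1.8371, matching the worked example there.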
3.1.2. Ties. If some of the N observations are equal, we suggest that each member of a group of ties be given the mean of the ranks tied for in that group. This does not affect the mean rank, ½(N+1). It does, however, reduce the variance below (N²−1)/12. Letting T = (t−1)t(t+1) for each group of ties, where t is the number of tied observations in the group, and letting ΣT represent the sum of the values of T for all groups of ties, we have, instead of (3.1),

(3.6)   \sigma_{\bar R}^2 = \frac{N(N^2-1) - \sum T}{12Nn} \cdot \frac{N-n}{N-1}

as the variance of the mean rank for samples of n. When there are no ties, ΣT = 0 and (3.6) reduces to (3.1), so (3.6) may be regarded as the general expression for σ_R̄² when the mean-rank method is used for such ties as occur. Notice that (3.6) is the product of (3.1) and (1.3).
This adjustment comes about as follows:⁷ The variance (N²−1)/12 is obtained by subtracting the square of the mean from the mean of the squares of N consecutive integers. If each of the t integers (x+1) through (x+t) is replaced by x + ½(t+1), the sum is not changed but the sum of the squares is reduced by

(3.7)   \sum_{i=1}^{t} (x+i)^2 - t\left[x + \tfrac{1}{2}(t+1)\right]^2 = \frac{T}{12}

So the mean of the squares, and consequently the variance, is reduced by T/12N.
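A quick numerical check of (3.7), for a few arbitrary choices of x and t (an illustrative sketch, not part of the original argument):

for x, t in [(0, 3), (5, 4), (10, 2)]:
    before = sum((x + i) ** 2 for i in range(1, t + 1))        # sum of squares of the t integers
    after = t * (x + (t + 1) / 2) ** 2                         # after replacing each by their mean
    assert abs((before - after) - (t ** 3 - t) / 12) < 1e-9    # the reduction is exactly T/12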
The mean-rank method of handling ties somewhat complicates the continuity adjustment, for the possible values of R are no longer simply the consecutive integers ½n(n+1) to ½n(2N−n+1), nor need they be symmetrical about ½n(N+1). Our guess, however, is that it is better to make the ±½ adjustment of Section 3.1.1 than not to make any.
⁷ This is the adjustment alluded to by Friedman [10, footnote 11]. An equivalent adjustment for mean ranks has been suggested by Hemelrijk [16, formula (6)], but in a very complex form. A much simpler version of his formula (6) is obtained by multiplying our (3.6) by n². The same adjustment has been suggested by Horn [18a].
This adjustment, however, goes back at least as far as a 1921 paper by 'Student' [48a], applying it to the Spearman rank correlation coefficient. For further discussion and other references, see Kendall [20, Chap. 3].
An alternative method of handling ties is to assign the ranks at random within a group of tied observations. The distribution of H under the null hypothesis is then the same as if there had been no ties, since the null hypothesis is that the ranks are distributed at random. In order to use this method, adequate randomization must be provided with consequent complications in making and verifying computations. Some statisticians argue further that the introduction of extraneous random variability tends to reduce the power of a test. We do not know whether for the H test random ranking of ties gives more or less power than mean ranks; indeed, it may be that the answer varies from one alternative hypothesis to another and from one significance level to another.8 When all members of a group of ties fall within the same sample, every assignment of their ranks gives rise to the same value of H, so that it might be thought artificial in this instance to use mean-ranks; even here, however, an eristic argument can be made for mean ranks, on the ground that H interprets a particular assignment of ranks against the background of all possible assignments of the same ranks to samples of the given sizes, and some of the possible assignments put the ties into different samples.9
3.1.3. Examples. (i) As a first example consider a particularly simple one discussed by Pitman [41].
TABLE 3.1
PITMAN EXAMPLE [41, p. 122]

Sample A                  Sample B
Observation   Rank        Observation   Rank
 0             1          16             4
11             2          19             5
12             3          22             7
20             6          24             8
                          29             9

n = 4, N = 9, R = 12
⁸ A few computations for simple distributions and small samples, some carried out by Howard L. Jones and some by us, showed mean ranks superior sometimes and random ranks others. For theoretical purposes, random ranking of ties is much easier to handle. For practical purposes, it should be remembered that there will ordinarily be little difference between the two methods; see notes 3 and 4. Computational considerations, therefore, lead us to suggest the mean-rank method.
Ranking of tied observations at random should be distinguished from increasing the power of a test by rejecting or accepting the null hypothesis on the basis of an ancillary random device, in such a way as to attain a nominal significance level which, because of discontinuities, could not otherwise be attained. Discussions of this are given by Eudey [6] and E. S. Pearson [37].
9 This is illustrated in the calculation of the exact probability for the data of Table 3.2.
If we use the two-tail H test without adjustment for continuity, we compute the approximate unit-normal deviate from (3.4):

\frac{2(12) - 4(10)}{\sqrt{(4 \times 10 \times 5)/3}} = \frac{-16}{\sqrt{200/3}} = -1.9596

corresponding to a two-tail normal probability of 0.0500. If we make the continuity adjustment, we get:

\frac{2(12\tfrac{1}{2}) - 4(10)}{\sqrt{(4 \times 10 \times 5)/3}} = \frac{-15}{\sqrt{200/3}} = -1.8371 \quad (\text{continuity adjustment})

corresponding to a two-tail normal probability of 0.0662.
Actually, since the samples are so small, it is easy to compute the true probability under the null hypothesis of a value of R as extreme as, or more extreme than, 12. There are 9!/4!5! or 126 ways of selecting four ranks from among the nine, and all 126 ways are equally probable under the null hypothesis. Only four of the 126 lead to values of R of 12 or less. By symmetry another set of 4 lead to values as extreme but in the opposite direction, that is, n(N+1) − R = 28 or more. Hence the true probability to compare with the foregoing approximations is 8/126, or 0.06349. This value can also be obtained from the tables given by Mann and Whitney [28]; they show 0.032 for one tail, and when doubled this agrees, except for rounding, with our calculation.¹⁰
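The enumeration just described is easily reproduced in a few lines (an illustrative sketch using Python's itertools; variable names are ours):

from itertools import combinations

n, N, R_observed = 4, 9, 12
tail = sum(1 for ranks in combinations(range(1, N + 1), n)
           if sum(ranks) <= R_observed or sum(ranks) >= n * (N + 1) - R_observed)
print(tail, tail / 126)   # 8 0.0634920..., the 8/126 found above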
(ii) A second, and more realistic, example will illustrate the kind of
TABLE 3.2
BROWNLEE EXAMPLE [2, p. 36]

Method A              Method B
Value    Rank         Value    Rank
95.6     9½           93.3     4
94.9     7            92.1     3
96.2     12           94.7     5½
95.1     8            90.1     2
95.8     11           95.6     9½
96.3     13           90.0     1
                      94.7     5½

R = 60½, n = 6, N = 13

¹⁰ Pitman [41] gives a test which is like H except that it considers the observations themselves instead of their ranks. For the example of Table 3.1, Pitman's test yields a two-tail probability of 5/126 or 0.03968.
complication that arises in practice. Table 3.2 shows the results of two alternative methods of technical chemical analysis. Since there are ties (two groups of two ties), mean ranks are used.
If we use (3.4) without adjusting either for continuity or for the use of mean ranks, we obtain as our approximate unit-normal deviate

\frac{121 - 84}{\sqrt{(84 \times 7)/3}} = \frac{37}{14} = 2.6429 \quad (\text{no adjustments})

which corresponds to the two-tail normal probability of 0.0082.
If we use the adjustment for mean ranks, we find that ΣT = 12, so (3.6) gives σ_R̄ = 1.1635 and the denominator of (3.4), which is

(3.8)   2\sigma_R = 2n\sigma_{\bar R},

is adjusted to 13.9615. This leads to the approximate unit-normal deviate

\frac{121 - 84}{13.9615} = 2.6501 \quad (\text{adjusted for mean ranks})

corresponding to a two-tail probability of 0.0080, not appreciably different from the result without the adjustment.
The continuity adjustment is not desirable in this case, since the probability level is appreciably less than 0.02.⁶ The comments of Section 3.1.2 about irregularities in the sequence of possible values of R also apply. For purely illustrative purposes, however, we note that the effect of the continuity adjustment would be to reduce R from 60½ to 60, resulting in an approximate normal deviate of

\frac{120 - 84}{13.9615} = 2.5785 \quad (\text{adjusted for continuity and mean ranks})

for which the symmetrical two-tail normal probability is 0.0099. The true probability in this case can be computed by considering all
possible sets of six that could be selected from the 13 ranks 1, 2, 3, 4, 5½, 5½, 7, 8, 9½, 9½, 11, 12, 13. There are 13!/6!7! or 1716 such sets, all equally probable under the null hypothesis. Six of them give rise to values of R greater than or equal to 60½, and five give rise to values of R less than or equal to 23½, which is as far below ½n(N+1) as 60½ is above it. Hence the true probability is 11/1716, or 0.00641.
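The figures just given can be checked numerically. The sketch below (variable names are ours) recomputes the tie-adjusted denominator of (3.4) from (3.6) and (3.8), and the exact tail count of 11 out of 1716 by direct enumeration of the mean ranks:

from itertools import combinations
from math import sqrt

mean_ranks = [1, 2, 3, 4, 5.5, 5.5, 7, 8, 9.5, 9.5, 11, 12, 13]
n, N, R = 6, 13, 60.5

sum_T = 2 * (2 ** 3 - 2)                                                   # two groups of two ties
var_Rbar = (N * (N ** 2 - 1) - sum_T) / (12 * N * n) * (N - n) / (N - 1)   # formula (3.6)
denominator = 2 * n * sqrt(var_Rbar)                                       # formula (3.8), about 13.9615
print(round((2 * R - n * (N + 1)) / denominator, 4))                       # 2.6501

low, high = n * (N + 1) - R, R                                             # 23.5 and 60.5
tail = sum(1 for c in combinations(mean_ranks, n) if sum(c) >= high or sum(c) <= low)
print(tail, round(tail / 1716, 5))                                         # 11 0.00641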
3.2. Three Samples
When there are three samples, we may consider the average ranks for any two of them, say the ith and jth. The other sample, the kth,
would not tell us anything we cannot find out from two, for its mean rank must be
would not tell us anything we cannot find out from two, for its mean rank must be

(3.9)   \frac{\tfrac{1}{2}N(N+1) - (n_i \bar R_i + n_j \bar R_j)}{N - (n_i + n_j)}
If the n's are not too small, the joint distribution of R̄_i and R̄_j is approximately that bivariate normal distribution whose exponent is

(3.10)   -\frac{1}{2(1-\rho^2)}\left[\frac{\left(\bar R_i - \frac{N+1}{2}\right)^2}{\sigma_{\bar R_i}^2} - \frac{2\rho\left(\bar R_i - \frac{N+1}{2}\right)\left(\bar R_j - \frac{N+1}{2}\right)}{\sigma_{\bar R_i}\sigma_{\bar R_j}} + \frac{\left(\bar R_j - \frac{N+1}{2}\right)^2}{\sigma_{\bar R_j}^2}\right]

The variances needed in (3.10) are given by (3.1) and the correlation by

(3.11)   \rho = -\sqrt{\frac{n_i n_j}{(N-n_i)(N-n_j)}}

which is the correlation between the means of samples of sizes n_i and n_j when all n_i + n_j are drawn at random without replacement from a population of N elements.¹¹ Thus the exponent (3.10) of the bivariate normal distribution which approximates the joint distribution of R̄_i and R̄_j is, when multiplied by −2,

(3.12)   \frac{12 n_i n_j}{N(N+1)(N-n_i-n_j)}\left[\frac{N-n_j}{n_j}\left(\bar R_i - \frac{N+1}{2}\right)^2 + 2\left(\bar R_i - \frac{N+1}{2}\right)\left(\bar R_j - \frac{N+1}{2}\right) + \frac{N-n_i}{n_i}\left(\bar R_j - \frac{N+1}{2}\right)^2\right]
It is well known that −2 times the exponent of a bivariate normal distribution has the χ²(2) distribution [32, Sec. 10.10].
¹¹ Although (3.11) is easily derived and is undoubtedly known to those who sample from finite populations, we have not found it in any of the standard references. A special case was given by Neyman [47, p. 39] in 1923, and a more general case appears in [38]. For assistance in trying to locate previous publications of (3.11) we are indebted to Dalenius (Stockholm), W. Edwards Deming (Bureau of the Budget), P. M. Grundy (Rothamsted Experimental Station) who told us of [38], Morris H. Hansen (Bureau of the Census), Maurice G. Kendall (London School of Economics), Jerzy Neyman (University of California) who told us of [47], June H. Roberts (Chicago), Frederick F. Stephan who provided a compact derivation of his own, John W. Tukey, and Frank Yates (Rothamsted Experimental Station).
Hence (3.12) could be taken as our test statistic for the three-sample problem, and approximate probabilities found from the χ² tables.
From the relations
(3.13)   n_i \bar R_i + n_j \bar R_j + n_k \bar R_k = \tfrac{1}{2}N(N+1)

and

(3.14)   n_i + n_j + n_k = N
it can be shown that the value of (3.12) will be the same whichever pair of samples is used in it, and that this value will be H as given by (1.2) with C=3. For computing, (1.2) has the advantages of being simpler than (3.12) and of treating all (R, n) pairs alike.
With three or more samples, adjustments for continuity are unimportant except when the ni are so small that special tables of the true distribution should be used anyway.
Since the adjustment for the mean-rank method of handling ties is a correction to the sum of squares of the N ranks, it is the same for three or more groups as for two. The variances given by (3.1) for the case without ties are replaced by (3.6) when there are ties; hence (1.2) with mean ranks should be divided by (1.3) to give H as shown by (1.4).
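The invariance asserted in Section 3.2, that (3.12) gives the same value for every choice of the pair of samples and that this value is H of (1.2), is easy to verify numerically. The sketch below (names are ours) evaluates (3.12) for each pair using the rank sums of Table 2.1:

def pair_statistic(Rbar_i, Rbar_j, n_i, n_j, N):
    """The quadratic form (3.12) for one pair of samples."""
    a = Rbar_i - (N + 1) / 2
    b = Rbar_j - (N + 1) / 2
    coeff = 12 * n_i * n_j / (N * (N + 1) * (N - n_i - n_j))
    return coeff * ((N - n_j) / n_j * a * a + 2 * a * b + (N - n_i) / n_i * b * b)

sizes = [5, 3, 4]            # Table 2.1
rank_sums = [24, 14, 40]
N = sum(sizes)
means = [R / n for R, n in zip(rank_sums, sizes)]

for i, j in [(0, 1), (0, 2), (1, 2)]:
    print(round(pair_statistic(means[i], means[j], sizes[i], sizes[j], N), 3))
# prints 5.656 three times -- the same value as H from (1.2)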
3.3. More than Three Samples
Nothing essentially new is involved when there are more than three samples. If there are C samples, the mean ranks for any C−1 of them are jointly distributed approximately according to a multivariate normal distribution, provided that the sample sizes are not too small. The exponent of this (C−1)-variate normal distribution will have the same value whichever set of C−1 samples is used. This value, when multiplied by −2, will be H as given by (1.2), and it will be distributed approximately as χ²(C−1), provided the n_i are not too small. The exponent of the approximating multivariate normal distribution is more complicated than for three samples, but it involves only the variances of the R̄_i as given by (3.6) and the correlations among pairs (R̄_i, R̄_j) as given by (3.11).
By using matrix algebra, the general formula for H is obtained quite as readily as the formulas for two and three samples by the methods used in this paper. A mathematically rigorous discussion of H for the general case of C samples is presented elsewhere by Kruskal [25], together with a formal proof that its distribution under the null hypothesis is asymptotically χ².
4. INTERPRETATION OF THE TEST
4.1. General Considerations
H tests the null hypothesis that the samples all come from identical populations. In practice, it will frequently be interpreted, as is F in the analysis of variance, as a test that the population means are equal against the alternative that at least one differs. So to interpret it, however, is to imply something about the kinds of differences among the populations which, if present, will probably lead to a significant value of H, and the kinds which, even if present, will probably not lead to a significant value of H. To justify this or any similar interpretation, we need to know something about the power of the test: For what alternatives to identity of the populations will the test probably lead to rejection, and for what alternatives will it probably lead to acceptance of the null hypothesis that the populations are identical? Unfortunately, for the H test as for many nonparametric tests the power is difficult to investigate and little is yet known about it.
It must be recognized that relations among ranks need not conform to the corresponding relations among the data before ranking. It is possible, for example, that if an observation is drawn at random from each of two populations, the one from the first population is larger in most pairs, but the average of those from the second population is larger. In such a case the first population may be said to have the higher average rank but the lower average value.
It has been shown by Kruskal [25] that a necessary and sufficient condition for the H test to be consistent12 is that there be at least one of the populations for which the limiting probability is not one-half that a random observation from this population is greater than an independent random member of the N sample observations. Thus, what H really tests is a tendency for observations in at least one of the populations to be larger (or smaller) than all the observations together, when paired randomly. In many cases, this is practically equivalent to the mean of at least one population differing from the others.
4.2. Comparison of Means when Variability Differs
Rigorously interpreted, all we can conclude from a significant value of H is that the populations differ, not necessarily that the means differ. In particular, if the populations differ in variability we cannot,
¹² A test is consistent against an alternative if, when applied at the same level of significance for increasing sample size, the probability of rejecting the null hypothesis when the alternative is true approaches unity. Actually, the necessary and sufficient condition stated here must be qualified in a way that is not likely to affect the interpretation of the H test suggested in this paragraph. An exact statement is given in [25].
strictly speaking, infer from a significant value of H that the means differ. In the data of Table 3.2, for example, the variances of the two chemical methods differ significantly (normal theory probability less than 0.01) and substantially (by a factor of 16), as Brownlee shows [2]. A strict interpretation of H and its probability of less than 0.01 does not, therefore, justify the conclusion that the means of the two chemical methods differ.
There is some reason to conjecture, however, that in practice the H test may be fairly insensitive to differences in variability, and so may be useful in the important "Behrens-Fisher problem" of comparing means without assuming equality of variances. Perhaps, for example, we could conclude that the means of the two chemical methods of Table 3.2 differ. The following considerations lend plausibility to this conjecture (and perhaps suggest extending it to other differences in form):
(i) The analysis of consistency referred to in Section 4.1 shows that if two symmetrical populations differ only by a scale factor about their common mean the H test is not consistent for small significance levels; in other words, below a certain level of significance there is no assurance that the null hypothesis of identical populations will be rejected, no matter how large the samples.
(ii) Consider the following extreme case: Samples of eight are drawn from two populations having the same mean but differing so much in variability that there is virtually no chance that any of the sample from the more variable population will lie within the range of the other sample. Furthermore, the median of the more variable population is at the common mean, so that its observations are as likely to lie above as to lie below the range of the sample from the less variable population. The actual distribution of H under these assumptions is easily computed from the binomial distribution with parameters 8 and ½. Figure 4.1 shows the exact distribution of H under the null hypothesis that the two populations are completely identical, under the symmetrical alternative just described, and under a similar but skew alternative in which the probability is 0.65 that an observation from the more variable population will lie below the range of the other sample and 0.35 that it will lie above. Possible values of H under each hypothesis are those at which occur the risers in the corresponding step function of Figure 4.1, and the probabilities at these possible values of H are given by the tops of the risers. Figure 4.1 shows, for example, that samples in which seven observations from the more variable population lie above and one lies below the eight observations from the less variable population (so that the two values of R are 44 and 92, leading to an H of
6.353) would be judged by the H test to be significant at the 0.010 level using true probabilities (or at the 0.012 level using the χ² approximation), while such samples will occur about seven per cent of the time under the symmetrical alternative and about seventeen per cent under the other. In view of the extreme difference of the variances assumed in the alternatives, it seems rather striking that the cumulative distributions given in Figure 4.1 do not differ more than they do. At least in the case of the symmetrical alternative, the distribution for the null
hypothesis seems not too poor a partial smoothing, though on the whole it lies too low.
FIGURE 4.1. Distribution of H for two samples of 8, under the null hypothesis that the populations are identical and under two alternatives in which the means are the same but the variances are extremely different. (For further specification of the alternatives, see Section 4.2.) [Graph not reproduced; it plots Pr{H > H0} against H0 for the null hypothesis, the symmetrical alternative, and the skew alternative.]
The applicability of the H test to the Behrens-Fisher problem, particularly in its two-tail form, merits further investigation.
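The arithmetic behind the extreme case of (ii) is easy to reproduce. The following short Python sketch (an editorial illustration, not part of the original paper) assumes the usual rank-sum form of H and recovers both the value 6.353 quoted above and the roughly seven per cent tail probability under the symmetrical alternative:

    # Extreme two-sample case of Section 4.2(ii): samples of 8, every observation of the
    # more variable sample lying wholly above or wholly below the block of 8 observations
    # from the less variable sample.
    from math import comb

    def H_from_rank_sums(rank_sums, sizes):
        # the usual rank form of H: 12/[N(N+1)] * sum(R_i^2 / n_i) - 3(N+1)
        N = sum(sizes)
        return 12.0 / (N * (N + 1)) * sum(R * R / m for R, m in zip(rank_sums, sizes)) - 3 * (N + 1)

    n, N = 8, 16
    H_of_k, prob_of_k = {}, {}
    for k in range(n + 1):                      # k observations of the variable sample lie above the block
        R_var = sum(range(1, n - k + 1)) + sum(range(N - k + 1, N + 1))
        R_other = N * (N + 1) // 2 - R_var
        H_of_k[k] = H_from_rank_sums([R_other, R_var], [n, n])
        prob_of_k[k] = comb(n, k) / 2.0 ** n    # k is Binomial(8, 1/2) under the symmetrical alternative

    crit = H_of_k[7]                            # 7 above, 1 below: R = 44 and 92
    print(round(crit, 3))                       # 6.353, as in the text
    tail = sum(p for k, p in prob_of_k.items() if H_of_k[k] >= crit - 1e-9)
    print(round(tail, 3))                       # about 0.070, the "seven per cent" of the text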
5. RELATED TESTS
5.1. Permutation Tests and Ranks
The H test stems from two statistical methods, permutations of the data, and rank transformations.
Permutation tests, which to the best of our knowledge were first proposed by Fisher [8] in connection with a defense of the normality assumption, accept or reject the null hypothesis according to the probability of a test statistic among all relevant permutations of the observed numbers; a precise general formulation of the method is given by Scheffé [45]. Applications of the permutation method to important cases may be found in articles by Pitman [41, 42, 43] and by Welch [57].
The use of ranks-or more generally, of conventional numbers-instead of the observations themselves has been proposed often, and we do not know to whom this idea may be credited.13 Its advantages have been summarized in Section 1.3. Its disadvantage is loss of information about exact magnitudes.
If in one-criterion variance analysis the permutation method based on the conventional F statistic is combined with the rank method, the result is the H test.
5.2. Friedman's χr²
Two kinds of data must be distinguished in discussing tests for the equality of C population averages. The first kind consists of C independent random samples, one from each population. The second kind consists of C samples of equal size which are matched (that is, cross-classified or stratified, each stratum contributing one observation to each sample) according to some criterion which may affect the values of the observations. This distinction is, of course, exactly that between one-criterion variance analysis with equal sample sizes and two-criterion variance analysis with one observation per cell.
For comparing the weights of men and women, data of the first kind might be obtained by measuring a random sample of n1 men and an independent random sample of n2 women. Such data would ordinarily be analyzed by one-criterion variance analysis, as described in Section 1.2 above, which in the two-sample case is equivalent to the two-tail t test with n1 + n2 - 2 degrees of freedom. The H test, or the two-sample version of it given by (3.4), would also be applicable.
Data of the second kind for the same problem might be obtained by selecting n ages (not necessarily all different) and for each age selecting at random one man and one woman. Such data would ordinarily be
13 Our attention has been directed by Harold Hotelling to the use of ranks by Galton [12, Chaps. 4 and 5] in 1889. Churchill Eisenhart and I. Richard Savage have referred us to the extensive analyses of ranks by eighteenth century French mathematicians in connection with preference-ordering problems, specifically elections. The earliest work they mention is by Borda [1] in 1770, and they mention also Laplace [26] in 1778, Condorcet [3] in 1786, and Todhunter's summary of these and related writings [51, Secs. 690, 806, 989, 990]. Systematic treatment of ranks as a nonparametric statistical device, however, seems to commence with the work of Hotelling and Pabst [19] in 1936.
analyzed by two-criterion variance analysis, the between-sexes component being the one tested. This test would be equivalent to the two-tail t test of the mean difference, with n-1 degrees of freedom. Friedman's χr² [10], or the two-tail sign test which is its two-sample version, would be appropriate.14
The H test thus provides a rank test for data of the first kind, just as the χr² test does for data of the second kind. H makes it possible to test by ranks the significance of a grouping according to a single criterion. The effect of one criterion cannot be tested by χr² unless the observations in the different groups are matched according to a second criterion. On the other hand, if the data are matched H is not appropriate and χr² should be used.
5.3. Wilcoxon's Two-Sample Test
The H test in its general form is new, as far as we know,15 but not its two-sample form.
5.3.1. Wilcoxon (1945, 1947). Wilcoxon was the first, we believe, to introduce the two-sample form. His first paper [61] considers the case of two samples of equal size and gives true probabilities for values of the smaller sum of ranks in the neighborhood of the 0.01, 0.02, and 0.05 probability levels for sample sizes from 5 to 10. A method of calculating the true probabilities is given. An example uses the mean-rank method for ties, interpreting the result in terms of a table for the no-ties situation.
In a second paper [62] on the case of two equal samples, Wilcoxon gives a normal approximation to the exact distribution, basing it on the theory of sampling without replacement from a finite uniform population, along the lines of Section 3.1 of the present paper. A table of 5 per cent, 2 per cent, and 1 per cent significance levels for the smaller total is given, covering sample sizes from 5 to 20.
5.3.2. Festinger (1946).16 Wilcoxon's test was discovered independently by Festinger [7], who considers the case where the two sample
sizes, n and m, are not necessarily equal. He gives a method of calculating true probabilities, and a table of two-tail 5 per cent and 1 per cent
14 For other discussions of χr², see Kendall and Smith [21], Friedman [11], and Wallis [55].
15 After an abstract [24] of a theoretical version [25] of the present paper was published we learned from T. J. Terpstra that similar work has been done at the Mathematical Center, Amsterdam, and that papers closely related to the H test will be published soon by himself [50] and by P. G. Rijkoort [44]; also that P. van Elteren and A. Benard are doing some research related to χr². References [50] and [44] propose tests based upon statistics similar to, but not identical with, H. Alan Stuart tells us that H. R. van der Vaart (University of Leiden) has been planning a generalization of the Wilcoxon test to several samples. P. V. Krishna Iyer has announced [23] "a non-parametric method of testing k samples." This brief announcement is not intelligible to us, but it states that "full details will be published in the Journal of the Indian Society of Agricultural Research."
16 We are indebted to Alan Stuart for calling our attention to Festinger's paper.
significance levels for n from 2 to 12, and for n from 13 to 15 with m from n to 30-n; and more extensive tables are available from him. A large proportion of the entries in Festinger's table, especially at the 5 per cent level, seem to be slightly erroneous.
5.3.3. Mann and Whitney (1947). Mann and Whitney [28] made an important advance in showing that Wilcoxon's test is consistent for the null hypothesis that the two populations are identical against the alternative that the cumulative distribution of one lies entirely above that of the other.17 They discuss the test in terms of a statistic U which, as they point out, is equivalent to Wilcoxon's sum of ranks (our R). When all observations from both samples are arranged in order, they count for each observation in one sample, say the first, the number of observations in the second sample that precede it. The sum of these counts for the first sample is called U. It is related to R, the sum of the ranks for the first sample, by18
(5.1)    U = R - n(n + 1)/2
They give a table showing the one-tail probability to three decimals for each possible value of U, for all combinations of sample sizes in which the larger sample is from three to eight.19
Hemelrijk [16] has pointed out recently that U, and consequently R for the two-sample case, may be regarded as a special case of Kendall's coefficient of rank correlation [20].
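Relation (5.1) is easy to verify on a small example. The following Python sketch (an editorial illustration with made-up, tie-free data, not part of the original paper) counts U directly and compares it with the value obtained from the rank sum R:

    # Hypothetical two-sample data; no ties, so ranks are unambiguous.
    first  = [1.2, 3.4, 5.6, 7.8]     # sample whose rank sum R we take (n = 4)
    second = [2.3, 4.5, 6.7]          # the other sample (m = 3)

    combined = sorted(first + second)
    R = sum(combined.index(x) + 1 for x in first)       # sum of ranks of the first sample

    # U counted directly: for each observation of the first sample, the number of
    # observations of the second sample preceding it.
    U = sum(sum(y < x for y in second) for x in first)

    n = len(first)
    print(U, R - n * (n + 1) // 2)    # both 6, as relation (5.1) requires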
5.3.4. Haldane and Smith (1948).20 Haldane and Smith [14] developed the Wilcoxon test independently in connection with the problem of deciding whether the probability of a hereditary trait appearing in a particular member of a sibship depends on his birthrank. They propose a test based on the sum of the birth-ranks of those members of a sibship having the trait-i.e., our R-where N is the number in the sibship and n is the number having the trait. They develop an approximate distribution from the theory of sampling from an infinite, continuous, uniform population, and approximate this by the unit normal deviate given in
17 Actually the test is consistent under more general conditions; see Section 5.3.6 (iv). 18 Mann and Whitney's version of this formula is a trifle different because they relate the count in the first sample (our terminology) to the sum of ranks in the other sample. 19 We have recomputed the Mann-Whitney table to additional decimals. It agrees entirely with our computations. 20 We are indebted to Alan Stuart for calling our attention to the Haldane and Smith paper. Blair M. Bennett, University of Washington, is computing power functions for the Wilcoxon test against alternatives appropriate to the birth-order problem. Bennett emphasizes, in a personal communication, that the distribution of R under the null hypothesis corresponds to a partition problem which has been studied in the theory of numbers for centuries-in particular by Euler [6a, Chap. 16], who in 1748 considered closely related partition problems and their generating functions, and by Cauchy [2a, Numbers 225, 226]. In fact, Euler [6a, p. 252*] gives a table which is in part equivalent to that of Mann and Whitney [28]. This number-theoretic approach is discussed by Wilcoxon [61].
this paper as (3.4)-including the continuity adjustment, which they seem to be the first to use. They tabulate the means and variances of 6R for values of N from 2 to 20, with n from 1 to N. They also give a table of exact probabilities (not cumulated) for all possible values of n up to N=12.
Haldane and Smith discuss the problem of ties in connection with multiple births. They propose to assign to each member of each birth the rank of that birth. In our terminology, they give each member of a tied group the lowest of the ranks tied for, and give the next individual or group the next rank, not the rank after the highest in the group tied for. For a test in this case, they refer to the theory of sampling without replacement from a finite but non-uniform population.
With the Haldane-Smith method of handling ties, the difference between the ranks of two non-tied observations is one more than the number of distinct values or groups intervening between the two, regardless of the number of intervening individuals; with the mean-rank method, the difference is one more than the number of observations intervening, plus half the number of other observations having the same rank as either of the two observations being compared. The mean-rank method seems preferable when the cause of ties is measurement limitations on an effectively continuous variable, the Haldane-Smith method when the cause is actual identity. Unfortunately, the Haldane-Smith method does not lend itself so readily as does the mean-rank method to simple adjustment of the formulas for the no-ties case, since the necessary adjustments depend upon the particular ranks tied for, not merely the number of ties.
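The two conventions can be made concrete with a small example. The following Python sketch (an editorial illustration on made-up values, not part of the original paper) shows the ranks each method assigns when one group of three observations is tied:

    values = [3, 5, 5, 5, 8, 9]       # one tied group (the three 5's)

    def mean_ranks(vals):
        # mean-rank method: tied observations share the mean of the ranks they tie for
        order = sorted(vals)
        return [sum(i + 1 for i, v in enumerate(order) if v == x) / order.count(x) for x in vals]

    def haldane_smith_ranks(vals):
        # Haldane-Smith method as described above: every member of a tied group gets the
        # lowest rank tied for, and the next distinct value gets the next rank
        distinct = sorted(set(vals))
        return [distinct.index(x) + 1 for x in vals]

    print(mean_ranks(values))           # [1.0, 3.0, 3.0, 3.0, 5.0, 6.0]
    print(haldane_smith_ranks(values))  # [1, 2, 2, 2, 3, 4]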
5.3.5. White (1952). Tables of critical values of R at two-tail significance levels of 5, 1, and 0.1 per cent for all sample sizes in which N ≤ 30 are given by White [59].21 He suggests that ties be handled by the mean-rank method, not allowing for its effect on the significance level, or else by assigning the ranks so as to maximize the final probability, which may then be regarded as an upper limit for the true probability.
5.3.6. Power of Wilcoxon's test. The power of nonparametric tests in general, and of the H test in particular, is difficult to investigate; but
21 Comparison of the 5 and 1 per cent levels given by White with Festinger's earlier and more extensive table [7] shows 104 disagreements among 392 comparable entries (78 disagreements among 196 comparisons at the 5 per cent level, and 26 among 196 at 1 per cent). In each disagreement, Festinger gives a lower critical value of the statistic, although both writers state that they have tabulated the smallest value of the statistic whose probability does not exceed the specified significance level. Three of the disagreements can be checked with the Mann-Whitney table [28]; in all three, White's entry agrees with Mann-Whitney's. In one additional case (sample sizes 4 and 11 at the 1 per cent level) we have made our own calculation and found Festinger's entry to have a true probability (0.0103) exceeding the stated significance level. The disagreements undoubtedly result from the fact that the distributions are discontinuous, so that exact 5 and 1 per cent levels cannot ordinarily be attained.
for the special case of Wilcoxon's two-sample test certain details have been discovered. Some that are interesting from a practical viewpoint are indicated below, but without the technical qualifications to which they are subject:
(i) Lehmann [27] has shown that the one-tail test is unbiased-that is, less likely to reject when the null hypothesis is true than when any alternative is true-but van der Vaart [52] has shown that the corresponding two-tail test may be biased.
(ii) Lehmann [27] has shown, on the basis of a theorem of Hoeffding's [17], that under reasonable alternative hypotheses, as under the null hypothesis, the distribution of the test statistic is asymptotically normal.
(iii) Mood [33] has shown that the asymptotic efficiency of Wilcoxon's test compared with Student's test, when both populations are normal with equal variance, is 3/π, i.e., 0.955. Roughly, this means that 3/π is the limiting ratio of sample sizes necessary for the two tests to attain a fixed power. This result was given in lecture notes by E. J. G. Pitman at Columbia University in 1948; it was also given by van der Vaart [52]. To the best of our knowledge, Mood's proof is the first complete one.
(iv) Lehmann [27] and van Dantzig [15, 51a], generalizing the findings of Mann and Whitney [28], have shown that the test is consistent12 if the probability differs from one-half that an observation from the first population will exceed one drawn independently from the second population (for one-tail tests the condition is that the probability differ from one-half in a stated direction). In addition van Dantzig [51a] gives inequalities for the power. The C-sample condition for consistency given by Kruskal (see Section 4.1) is a direct extension of the two-sample condition given by Lehmann and van Dantzig.
5.4. Whitney's Three-Sample Test
Whitney [60] has proposed two extensions of the Wilcoxon test to the three-sample case. Neither of his extensions, which are expressed in terms of inversions of order rather than in terms of ranks, is equivalent to our H test for C=3, since Whitney seeks tests with power against more specific alternatives than those appropriate to the H test.
Whitney arrays all three samples in a single ranking and then defines U as the number of times in which an observation from the second sample precedes an observation from the first and V as the number of times in which an observation from the third sample precedes one from the first.22
22 U and V are not determined by R1, R2, and R3, nor vice versa, though U + V = R1 - n1(n1 + 1)/2.
Whitney's first test, which rejects the null hypothesis of equality of the populations if both U and V are too small (alternatively, too large), is suggested when the alternative is that the cumulative distribution of the first population lies above (alternatively, below) those of both the second and third populations. His second test, which rejects if U is too large and V is too small, is suggested when the alternative is that the cumulative distribution of the first population lies below that of the second and above that of the third.
5.5. Terpstra's C-Sample Test
Terpstra [50a] has proposed and investigated a test appropriate for alternatives similar to those of Whitney's second test, but extending to any number of populations.
5.6. Mosteller's C-Sample Test
Mosteller [34] has proposed a multi-decision procedure for accepting either the null hypothesis to which the H test is appropriate or one of the C alternatives that the ith population is translated to the right (or left) of the others. His criterion is the number of observations in the sample containing the largest observation that exceed all observations in other samples. This procedure has been discussed further by Mosteller and Tukey [35].
5.7. Fisher and Yates' Normalized Ranks
Fisher and Yates have proposed [9, Table XX] that each observation be replaced not by its simple rank but by a normalized rank, defined as the average value of the observation having the corresponding rank in samples of N from a normal population with mean of zero and standard deviation of one. They propose that ordinary one-criterion variance analysis then be applied to these normalized ranks. Ehrenberg [5] has suggested, as a modification, using the values of a random sample of N from the standardized normal population.
Two advantages might conceivably be gained by replacing the observations by normalized ranks or by some other set of numbers instead of by simple ranks. First, it might be that the distribution theory would be simplified. Quite a large class of such transformations, for example, leads to tests whose distribution is asymptotically χ²(C - 1); but for some transformations the χ² approximation may be satisfactory at smaller sample sizes than for others, thus diminishing the area of need for special tables and approximations such as those presented in Sec. 6.
Second, the power of the test might be greater against important classes of alternatives.
Whether either of these possible advantages over ranks is actually realized by normalized ranks, or by any other specific transformation, has not to our knowledge been investigated. Offhand, it seems intuitively plausible that the χ² distribution might be approached more rapidly with normalized ranks, or some other set of numbers which resemble the normal form more than do ranks. On the other hand, it seems likely that if there is such an advantage it is not very large, partly because the distribution of means from a uniform population approaches normality rapidly as sample size increases, and partly because (as Section 6 indicates) the distribution of H approaches the χ² distribution quite rapidly as sample sizes increase. As to power, we have no suggestions, except the obvious one that the answer is likely to differ for different alternatives of practical interest.23
5.8. Other Related Tests
A number of tests have been proposed which have more or less the same purpose as H and are likewise non-parametric. We mention here only two of the principal classes of these.
5.8.1. Runs. Wald and Wolfowitz [53] have proposed for the two-sample case that all observations in both samples be arranged in order of magnitude, that the observations then be replaced by designations A or B, according to which sample they represent, and that the number of runs (i.e., groups of consecutive A's or consecutive B's) be used to test the null hypothesis that both samples are from the same population. The distribution theory of this test has been discussed by Stevens
[48], Wald and Wolfowitz [53], Mood [31], Krishna Iyer [22], and oth-
ers; and Swed and Eisenhart [49] have provided tables covering all cases in which neither sample exceeds 20. For larger samples, normal approximations are given by all the writers mentioned. Wald and Wolfowitz discussed the consistency of the test, and later Wolfowitz [63] discussed its asymptotic power. An extension to cases of three or more samples has been given by Wallis [56], based on the distribution theory of Mood and Krishna Iyer.
5.8.2. Order statistics. Westenberg [58] has suggested a test for the two-sample case utilizing the number of observations in each sample above the median of the combined samples. Mood and Brown [32, pp.
23 When the true distributions are normal, Hoeffding [18] has shown that in many cases, including at least some analysis of variance ones, the test based on normalized ranks becomes as powerful as that based on the actual observations, when the sample sizes increase toward infinity.
394-5, 398-9] have discussed the test further and generalized it to several samples. Massey [29] has generalized the test further by using other order statistics of the combined samples as a basis for a more refined classification.
6. SIGNIFICANCE LEVELS, TRUE AND APPROXIMATE
6.1. True Significance Levels
6.1.1. Two samples. Festinger [7], Haldane and Smith [14], Mann and Whitney [28], White [59], and Wilcoxon [61, 62] have published tables for the two-sample case. These are described in Section 5.3. They are exact only if ties are absent or are handled by the random-rank method, but our guess is that they will also serve well enough if the mean-rank method is used and there are not too many ties.
6.1.2. Three samples. (i) Five or fewer observations in each sample. For each of these cases, Table 6.1 shows three pairs of values of H and their probabilities of being equalled or exceeded if the null hypothesis is true24. Each pair brackets as closely as possible the 10, 5, or 1 per cent level, except that in some cases one or both members of a pair are missing because H can take only a small number of values. The final sentence of Section 6.1.1, about ties, applies to Table 6.1 also.
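The "true" probabilities of Table 6.1 can be reproduced by complete enumeration of rank assignments. The following Python sketch (an editorial illustration, not the authors' program) does so for three samples of 2 and recovers the tabled value, H = 4.5714 with probability .067:

    from itertools import permutations

    def H_stat(groups):
        # H in its rank form, computed from the ranks assigned to each sample
        N = sum(len(g) for g in groups)
        S = sum(sum(g) ** 2 / len(g) for g in groups)
        return 12.0 / (N * (N + 1)) * S - 3 * (N + 1)

    sizes = (2, 2, 2)
    N = sum(sizes)
    hits = total = 0
    for perm in permutations(range(1, N + 1)):      # every assignment of the ranks 1..N
        groups, start = [], 0
        for m in sizes:
            groups.append(perm[start:start + m])
            start += m
        total += 1
        if H_stat(groups) >= 4.5714 - 1e-4:
            hits += 1
    print(round(hits / total, 3))                   # 0.067, as in the 2,2,2 line of Table 6.1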
(ii) More than five observations in each sample. No exact tables are available for these cases. Our recommendation is that the χ² approximation be used. Only at very small significance levels (less than 1 per cent, say) and sample sizes only slightly above five is there likely to be appreciable advantage to the more complicated Γ and B approximations described in Section 6.2. This recommendation is based only on the comparisons shown in Table 6.1, no true probabilities having been computed in this category.
(iii) Intermediate cases. No exact tables are available here. The Γ and B approximations probably should be resorted to if more than roughly approximate probabilities are required. Except at very low significance levels or with very small samples, the Γ approximation, which is simpler, should serve. This recommendation is not very firm, however, since we have computed no true probabilities in this category.
6.1.3. More than three samples. Since we have computed no true probabilities for more than three samples, our recommendations here
24 These computations and others used for this paper were made by John P. Gilbert with the assistance of Billy L. Foster, Thomas O. King, and Roland Silver. Space prevents reproducing all or even most of the results, but we hope to file them in such a way that interested workers may have access to them. We have the true joint distributions of R1, R2, and R3 under the null hypothesis for n1, n2, and n3, each from 1 through 5, and the true distribution of H under the same conditions, except that for some cases we have probabilities only for those values of H exceeding the upper twenty per cent level.
must be entirely tentative. It seems safe to use the χ² approximation when all samples are as large as five. If any sample is much smaller than five, the Γ or B approximation should probably be used, especially at low significance levels, though the importance of this presumably is less the larger the proportion of samples of more than five.
6.2. Approximate Significance Levels
6.2.1. χ² approximation. This is the approximation discussed in Sec-
tions 1, 2, and 3. The most extensive single table is that of Hald and Sinkbaek [13], though the table in almost any modern statistics text will ordinarily suffice.
6.2.2. Γ approximation. This utilizes the incomplete-Γ distribution by matching the variance as well as the true mean of H. The mean, or expected value, of H under the null hypothesis is [25]
(6.1)    E = C - 1
and the variance is
(6.2)    V = 2(C - 1) - 2[3C² - 6C + N(2C² - 6C + 1)] / [5N(N + 1)] - (6/5) Σ (1/ni),
the sum extending over the C samples.
One way of applying the approximation is to enter an ordinary χ² table taking χ² = 2HE/V and degrees of freedom f = 2E²/V. Note that the degrees of freedom will not ordinarily be an integer, so interpolation will be required in both χ² and the degrees of freedom if the four bounding tabular entries do not define the probability accurately enough.25
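For readers who would rather compute than interpolate in tables, the following Python sketch (an editorial illustration, not part of the original paper; it uses scipy's chi-square survival function, which accepts non-integral degrees of freedom) carries out this Γ approximation and is checked against the worked example given in Section 6.2.3:

    from scipy.stats import chi2

    def gamma_approx_prob(H, sizes):
        # E and V from (6.1) and (6.2); then enter a chi-square table with
        # chi2 = 2HE/V and (generally non-integral) degrees of freedom 2E^2/V.
        C, N = len(sizes), sum(sizes)
        E = C - 1
        V = (2 * (C - 1)
             - 2 * (3 * C**2 - 6 * C + N * (2 * C**2 - 6 * C + 1)) / (5 * N * (N + 1))
             - 1.2 * sum(1.0 / n for n in sizes))
        return chi2.sf(2 * H * E / V, df=2 * E**2 / V)

    # The 5, 4, 3 line of Table 6.1 near the 5 per cent point (H = 5.6308).
    print(round(gamma_approx_prob(5.6308, [5, 4, 3]), 3))   # about 0.044, as in Table 6.1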
6.2.3. B approximation. This utilizes the incomplete-B distribution by matching the true maximum as well as the mean and variance of H. The maximum value of H is [25]
(6.3)    M = [N³ - Σ ni³] / [N(N + 1)]
the sum again extending over the C samples.
To apply the approximation, K. Pearson's table of the incomplete-B distribution [39] may be employed, but it is usually more convenient to use the F distribution, a form of the incomplete-B distribution, since
25 The Γ approximations shown in Table 6.1 were based on K. Pearson's table of the incomplete Γ-function [40]. In Pearson's notation, the required probability is 1 - I(u, p), where u = H/√V and p = E²/V - 1. We used linear double interpolation, which on a few tests seemed to be satisfactory in the region of interest.
tables of F are widely accessible to statisticians.26 We set
(6.4)    F = H(M - E) / [E(M - H)]
with degrees of freedom (not usually integers)
(6.5)    f1 = 2E[E(M - E) - V] / (MV)
(6.6)    f2 = 2(M - E)[E(M - E) - V] / (MV) = f1(M - E)/E
The probability may then be obtained by three-way interpolation in the F tables or by using Paulson's approximation [36], according to which the required probability, P, is the probability that a unit normal deviate will exceed
(6.7)    KP = [(1 - 2/(9f2)) F' - (1 - 2/(9f1))] / √[(2/(9f2)) F'² + 2/(9f1)]
where F' = F^(1/3), the cube root of F. As an illustration, suppose C = 3, n1 = 5, n2 = 4, n3 = 3, and H = 5.6308.
From (6.1), (6.2), and (6.3) we find E = 2, V = 3.0062, and M = 9.6923. Substituting these into (6.4), (6.5), and (6.6) gives F = 5.332, f1 = 1.699, and f2 = 6.536. Then (6.7) gives KP = 1.690, for which the normal distribution shows a probability of 0.046. This may be compared with the true probability of 0.050, the χ² approximation of 0.060, and the Γ approximation of 0.044, shown in Table 6.1.27
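The same illustration can be pushed through Paulson's approximation in a few lines. The following Python sketch (an editorial illustration, not part of the original paper) uses the values E = 2, V = 3.0062, and M = 9.6923 found above:

    from math import sqrt
    from scipy.stats import norm

    H, E, V, M = 5.6308, 2.0, 3.0062, 9.6923

    F = H * (M - E) / (E * (M - H))                   # (6.4)
    d = 2 * (E * (M - E) - V) / (M * V)
    f1, f2 = E * d, (M - E) * d                       # (6.5) and (6.6)

    Fc = F ** (1.0 / 3.0)                             # F', the cube root of F
    Kp = ((1 - 2 / (9 * f2)) * Fc - (1 - 2 / (9 * f1))) / sqrt((2 / (9 * f2)) * Fc**2 + 2 / (9 * f1))  # (6.7)

    print(round(F, 3), round(f1, 3), round(f2, 3))    # 5.332, 1.699, 6.536
    print(round(Kp, 3), round(norm.sf(Kp), 3))        # 1.690 and about 0.046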
26 The most detailed table of the F distribution is that of Merrington and Thompson [30].
27 The B approximations shown in Table 6.1 are based on K. Pearson's table of the incomplete-B function [39]. In Pearson's notation, the required probability is 1 - Ix(p, q), where x = H/M, p = f1/2, and q = f2/2. To simplify the three-way interpolation, the following device (based on the relation of the incomplete-B to the binomial distribution, and of the binomial to the normal distribution) was used: First, let p0, q0, and x0 be the tabulated arguments closest to p, q, and x, and as a first approximation to the required probability take 1 - Ix0(p0, q0). Second, add to this first approximation the probability that a unit normal deviate will not exceed (in algebraic, not absolute, value)
K = [p - 1/2 - x(p + q - 1)] / √[x(1 - x)(p + q - 1)]
Third, subtract from this the probability that a unit normal deviate will not exceed K0, where K0 is defined like K but in terms of p0, q0, and x0. This method of interpolation was compared at three points with the trivariate Everett formula to third differences as presented by Pearson [39, Introduction]. The results were not excellent, but seemed to suffice for the present purposes.
Strictly speaking, all our statements and numerical results concerning the B approximation (including entries in Table 6.1) actually apply to that approximation based on Pearson's tables in combination with this normal interpolation device.
Values calculated in this way will not in general agree precisely with those calculated by interpolating in the F tables or by using Paulson's approximation, though the example in the text agrees to three decimals.
[FIGURE 6.1. True and approximate (χ², Γ, and B) probabilities for H, for samples of sizes 3, 4, and 5; graph not reproduced. See Section 6.3.]
[FIGURE 6.2. True and approximate (χ², Γ, and B) probabilities for H, for three samples of 5; graph not reproduced. See Section 6.3.]
[FIGURE 6.3. Errors of the χ², Γ, and B approximations, plotted against the true probabilities in the neighborhood of the 1, 5, and 10 per cent points, for three samples of sizes 2 to 5. Cases involving samples of one, and a few involving samples of two, are omitted; cases in which the smallest sample is 2 are distinguished from those in which it exceeds 2. Graph not reproduced; see Section 6.3.]
TABLE 6.1 TRUE DISTRIBUTION OF H FOR THREE SAMPLES, EACH OF
SIZE FIVE OR LESS, IN THE NEIGHBORHOOD OF THE 10, 5, AND 1 PER CENT POINTS; AND COMPARISON WITH THREE APPROXIMATIONS
The probabilities shown are the probabilities under the null hypothesis
that H will equal or exceed the values in the column headed "H"
n1 n2 n3    H    True probability    Approximate minus true: χ²    Γ (Linear Interp.)    B (Normal Interp.)
2 1 1 2.7000 .500 -.241 -.309 -.500
2 2 1 3.6000 .267 - .101 - .167 - .267
2 2 2 4.5714 .067 +.035 -.007 -.067 3.7143 .200 - .044 - .083 +.010
3 1 1 3.2000 .300 - .098 - .180 - .300
3 2 1 4.2857 .100 +.017 - .040 - .100 3.8571 .133 +.012 - .045 - .042
3 2 2 5.3572 .029 +.040 +.083 -.029 4.7143 .048 +.047 +.012 +.014 4.5000 .067 +.039 +.003 +.020 4.4643 .105 +.002 - .033 - .014
3 3 1 5.1429 .043 +.034 - .010 - .043 4.5714 .100 +.002 - .046 - .062 4.0000 .129 +.007 - .041 - .024
3 3 2 6.2500 .011 +.033 +.012 -.011 5.3611 .032 +.036 +.010 +.001 5.1389 .061 +.016 - .012 - .019 4.5556 .100 +.002 - .027 - .020 4.2500 .121 - .002 - .031 - .014
3 3 3 7.2000 .004 +.024 +.010 -.004 6.4889 .011 +.028 +.011 - .001 5.6889 .029 +.030 +.009 +.003 5.6000 .050 +.011 - .010 - .015 5.0667 .086 - .006 - .029 - .026 4.6222 .100 -.001 -.025 -.010
4 1 1 3.5714 .200 - .032 - .114 - .200
4 2 1 4.8214 .057 +.033 -.017 -.057 4.5000 .076 +.029 - .022 - .047 4.0179 .114 +.020 - .032 - .056
4 2 2 6.0000 .014 +.036 +.010 -.014 5.3333 .033 +.036 +.007 - .017 5.1250 .052 +.025 - .006 - .021 4.3750 .100 +.012 - .020 - .002 4.1667 .105 +.020 - .012 +.014
TABLE 6.1 (Continued)
n1 n2 n3    H    True probability    Approximate minus true: χ²    Γ (Linear Interp.)    B (Normal Interp.)
4 3 1 5.8333 .021 +.033 -.001 -.021 5.2083 .050 +.024 - .016 - .037 5.0000 .057 +.025 - .016 - .034 4.0556 .093 +.039 - .005 +.014 3.8889 .129 +.014 - .028 - .003
4 3 2 6.4444 .009 +.031 +.012 -.002 6.4222 .010 +.030 +.011 - .004 5.4444 .047 +.019 - .005 - .010 5.4000 .052 +.016 - .008 - .013 4.5111 .098 +.006 - .020 - .004 4.4667 .101 +.006 - .020 - .003
4 3 3 6.7455 .010 +.024 +.010 -.001 6.7091 .013 +.022 +.007 - .003 5.7909 .046 +.010 - .009 - .013 5.7273 .050 +.007 - .012 - .015 4.7091 .094 +.001 - .021 - .006 4.7000 .101 - .006 - .027 - .012
4 4 1 6.6667 .010 +.026 +.002 -.010 6.1667 .022 +.024 - .005 - .020 4.9667 .048 +.036 - .003 - .009 4.8667 .054 +.034 - .005 - .009 4.1667 .082 +.042 +.002 +.016 4.0667 .102 +.029 - .011 +.007
4 4 2 7.0364 .006 +.024 +.010 -.002 6.8727 .011 +.021 +.006 - .005 5.4545 .046 +.020 - .002 - .003 5.2364 .052 +.021 - .002 +.001 4.5545 .098 +.005 - .019 - .003 4.4455 .103 +.006 - .018 +.000
4 4 3 7.1439 .010 +.018 +.007 -.002 7.1364 .011 +.018 +.006 - .003 5.5985 .049 +.012 - .005 - .004 5.5758 .051 +.011 - .006 - .005 4.5455 .099 +.004 - .015 +.003 4.4773 .102 +.004 -.014 +.004
4 4 4 7.6538 .008 +.014 +.005 .000 7.5385 .011 +.012 +.003 - .002 5.6923 .049 +.009 - .006 - .002 5.6538 .054 +.005 - .010 - .007 4.6539 .097 +.001 - .015 +.004 4.5001 .104 +.001 - .015 +.007
5 1 1 3.8571 .143 +.003 - .109 -.143
TABLE 6.1 (Continued)
n1 n2 n3    H    True probability    Approximate minus true: χ²    Γ (Linear Interp.)    B (Normal Interp.)
5 2 1 5.2500 .036 +.037 -.006 -.036 5.0000 .048 +.034 +.011 - .037 4.4500 .071 +.037 - .012 - .020 4.2000 .095 +.027 - .022 - .018 4.0500 .119 +.013 - .036 - .024
5 2 2 6.5333 .008 +.030 +.010 -.008 6.1333 .013 +.033 +.010 - .010 5.1600 .034 +.041 +.013 +.008 5.0400 .056 +.025 - .004 - .006 4.3733 .090 +.022 - .007 +.010 4.2933 .122 - .005 - .034 - .014
5 3 1 6.4000 .012 +.029 +.002 - .012 4.9600 .048 +.036 - .004 - .010 4.8711 .052 +.036 - .004 - .009 4.0178 .095 +.039 - .002 +.018 3.8400 .123 +.024 - .016 +.010
5 3 2 6.9091 .009 +.023 +.007 - .006 6.8218 .010 +.023 +.007 - .006 5.2509 .049 +.023 - .000 +.001 5.1055 .052 +.026 +.003 +.006 4.6509 .091 +.006 - .018 - .005 4.4945 .101 +.005 - .020 - .003
5 3 3 6.9818 .010 +.020 +.008 -.002 6.8606 .011 +.022 +.008 - .001 5.4424 .048 +.018 - .000 +.002 5.3455 .050 +.019 +.000 +.004 4.5333 .097 +.007 - .013 +.004 4.4121 .109 +.001 - .018 +.000
5 4 1 6.9545 .008 +.023 +.002 -.008 6.8400 .011 +.022 - .000 - .011 4.9855 .044 +.038 +.002 - .001 4.8600 .056 +.032 - .005 - .005 3.9873 .098 +.038 +.001 +.018 3.9600 .102 +.036 - .000 +.018
5 4 2 7.2045 .009 +.018 +.005 - .005 7.1182 .010 +.018 +.005 - .005 5.2727 .049 +.023 +.002 +.005 5.2682 .050 +.021 +.000 +.004 4.5409 .098 +.005 - .017 - .002 4.5182 .101 +.004 - .018 - .002
TABLE 6.1 (Continued)
n1 n2 n3    H    True probability    Approximate minus true: χ²    Γ (Linear Interp.)    B (Normal Interp.)
5 4 3 7.4449 .010 +.014 +.004 -.004 7.3949 .011 +.014 +.004 - .004 5.6564 .049 +.010 - .005 - .004 5.6308 .050 +.010 - .006 - .004 4.5487 .099 +.004 - .013 +.003 4.5231 .103 +.001 - .016 - .000
5 4 4 7.7604 .009 +.011 +.003 -.002 7.7440 .011 +.010 +.002 - .003 5.6571 .049 +.010 - .004 +.000 5.6176 .050 +.010 - .004 +.001 4.6187 .100 - .001 - .016 +.003 4.5527 .102 +.001 - .014 +.005
5 5 1 7.3091 .009 +.016 -.002 -.009 6.8364 .011 +.022 +.001 - .009 5.1273 .046 +.031 - .003 - .005 4.9091 .053 +.032 - .002 - .002 4.1091 .086 +.042 +.007 +.020 4.0364 .105 +.028 - .007 +.008
5 5 2 7.3385 .010 +.016 +.004 -.004 7.2692 .010 +.016 +.004 - .004 5.3385 .047 +.022 +.003 +.006 5.2462 .051 +.022 +.002 +.007 4.6231 .097 +.002 - .018 - .005 4.5077 .100 +.005 - .016 - .001
5 5 3 7.5780 .010 +.013 +.004 -.001 7.5429 .010 +.013 +.004 - .002 5.7055 .046 +.012 - .003 +.000 5.6264 .051 +.009 - .005 - .002 4.5451 .100 +.003 - .012 +.007 4.5363 .102 +.002 - .014 +.005
5 5 4 7.8229 .010 +.010 +.003 -.002 7.7914 .010 +.010 +.003 - .002 5.6657 .049 +.010 - .003 +.001 5.6429 .050 +.009 - .003 +.001 4.5229 .099 +.005 - .009 +.010 4.5200 .101 +.004 - .010 +.008
5 5 5 8.0000 .009 +.009 +.003 -.002 7.9800 .010 +.008 +.002 - .003 5.7800 .049 +.007 - .005 - .001 5.6600 .051 +.008 - .004 +.001 4.5600 .100 +.003 -.010 +.008 4.5000 .102 +.004 - .009 +.009
6.3. Comparisons of True and Approximate Significance
Figures 6.1 and 6.2 show the true probabilities and the χ², Γ, and B approximations when the sample sizes are 3, 4, and 5, and when they are all 5.28
For each entry in Table 6.1 the probabilities given by the three approximations have been computed and their errors recorded in the last three columns of the table. In Figure 6.3 these errors are graphed against the true probabilities. To avoid confusing this figure, sample sizes have not been indicated; cases involving samples of one have been omitted, and cases involving samples of two have been distinguished from those in which the smallest sample exceeds two.
7. REFERENCES
[1] Borda, Jean Charles, "Mémoire sur les élections au scrutin," Mémoires de l'Académie royale des Sciences de Paris pour l'Année 1781, pp. 657-65.
[2] Brownlee, K. A., Industrial Experimentation, Third American Edition, Brooklyn, Chemical Publishing Company, 1949.
[2a] Cauchy, Augustin, Oeuvres complètes, Series 1, Volume 8, Paris, Gauthier-Villars et Fils, 1893.
[3] Condorcet, le Marquis de (Marie Jean Antoine Nicolas Caritat), Essai sur l'application de l'analyse à la probabilité des décisions rendues à la pluralité des voix, Paris, 1785, pp. lvii, clxxvii ff.
[4] DuBois, Philip, "Formulas and tables for rank correlation," Psychological Record, 3 (1939), 46-56.
[5] Ehrenberg, A. S. C., "Note on normal transformations of ranks," British Journal of Psychology, Statistical Section, 4 (1951), 133-4.
[6] Eudey, M. W., On the treatment of discontinuous random variables, Technical Report Number 13, Statistical Laboratory, University of California (Berkeley), 1949.
[6a] Euler, Leonhard, Introduction à l'analyse infinitésimale (translated from the Latin edition of 1748 into French by J. B. Labey), Vol. 1, Paris, Chez Barrois, 1796.
[7] Festinger, Leon, "The significance of differences between means without reference to the frequency distribution function," Psychometrika, 11 (1946), 97-105.
[8] Fisher, R. A., The Design of Experiments, Edinburgh, Oliver and Boyd Ltd., 1935 and later.
[9] Fisher, Ronald A., and Yates, Frank, Statistical Tables for Biological, Agricultural and Medical Research, Edinburgh, Oliver and Boyd Ltd., 1938 and later.
[10] Friedman, Milton, "The use of ranks to avoid the assumption of normality implicit in the analysis of variance," Journal of the American Statistical Association, 32 (1937), 675-701.
[11] Friedman, Milton, "A comparison of alternative tests of significance for the problem of m rankings," Annals of Mathematical Statistics, 11 (1940), 86-92.
28 All four figures in this paper are the work of H. Irving Forman.
[12] Galton, Sir Francis, Natural Inheritance, London, Macmillan and Co., 1889.
[13] Hald, A., and Sinkbaek, S. A., "A table of percentage points of the χ²-distribution," Skandinavisk Aktuarietidskrift, 33 (1950), 168-75.
[14] Haldane, J. B. S., and Smith, Cedric A. B., "A simple exact test for birth-order effect," Annals of Eugenics, 14 (1947-49), 117-24.
[15] Hemelrijk, J., "A family of parameterfree tests for symmetry with respect to a given point. II," Proceedings, Koninklijke Nederlandse Akademie van Wetenschappen, 53 (1950), 1186-98.
[16] Hemelrijk, J., "Note on Wilcoxon's two-sample test when ties are present," Annals of Mathematical Statistics, 23 (1952), 133-5.
[17] Hoeffding, Wassily, "A class of statistics with asymptotically normal distributions," Annals of Mathematical Statistics, 19 (1948), 293-325.
[18] Hoeffding, Wassily, "Some powerful rank order tests" (abstract), Annals of Mathematical Statistics, 23 (1952), 303.
[18a] Horn, Daniel, "A correction for the effect of tied ranks on the value of the rank difference correlation coefficient," Journal of Educational Psychology, 33 (1942), 686-90.
[19] Hotelling, Harold, and Pabst, Margaret Richards, "Rank correlation and tests of significance involving no assumption of normality," Annals of Mathematical Statistics, 7 (1936), 29-43.
[20] Kendall, Maurice G., Rank Correlation Methods, London, Charles Griffin and Company, 1948.
[21] Kendall, Maurice G., and Smith, B. Babington, "The problem of m rankings," Annals of Mathematical Statistics, 10 (1939), 275-87.
[22] Krishna Iyer, P. V., "The theory of probability distributions of points on a line," Journal of the Indian Society of Agricultural Statistics, 1 (1948), 173-95.
[23] Krishna Iyer, P. V., "A non-parametric method of testing k samples," Nature, 167 (1951), 33.
[24] Kruskal, William H., "A nonparametric analogue based upon ranks of one-way analysis of variance" (abstract), Annals of Mathematical Statistics, 23 (1952), 140.
[25] Kruskal, William H., "A nonparametric test for the several sample problem," Annals of Mathematical Statistics, 23 (1952), 525-40.
[26] Laplace, Pierre Simon, A Philosophical Essay on Probabilities, New York, Dover Publications, Inc., 1951 (first edition 1814).
[27] Lehmann, E. L., "Consistency and unbiasedness of certain non-parametric tests," Annals of Mathematical Statistics, 22 (1951), 165-79.
[28] Mann, H. B., and Whitney, D. R., "On a test of whether one of two random variables is stochastically larger than the other," Annals of Mathematical Statistics, 18 (1947), 50-60.
[29] Massey, Frank J., Jr., "A note on a two-sample test," Annals of Mathematical Statistics, 22 (1951), 304-6.
[30] Merrington, Maxine, and Thompson, Catherine M., "Tables of percentage points of the inverted Beta (F) distribution," Biometrika, 33 (1943), 73-88.
[31] Mood, A. M., "The distribution theory of runs," Annals of Mathematical Statistics, 11 (1940), 367-92.
[32] Mood, Alexander McFarlane, Introduction to the Theory of Statistics, New York, McGraw-Hill Book Co., 1950.
[33] Mood, A. M. Unpublished manuscript, submitted to Annals of Mathematical Statistics.
[34] Mosteller, Frederick, "A k-sample slippage test for an extreme population," Annals of Mathematical Statistics, 19 (1948), 58-65.
[35] Mosteller, Frederick, and Tukey, John W., "Significance levels for a ksample slippage test," Annals of Mathematical Statistics, 21 (1950), 120-3.
[36] Paulson, Edward, "An approximate normalization of the analysis of variance distribution," Annals of Mathematical Statistics, 13 (1942), 233-5.
[37] Pearson, E. S., "On questions raised by the combination of tests based on discontinuous distributions," Biometrika, 37 (1950), 383-98.
[38] Pearson, Karl, "On a certain double hypergeometrical series and its representation by continuous frequency surfaces," Biometrika, 16 (1924), 172-88.
[39] Pearson, Karl, editor, Tables of the Incomplete Beta Function, London, Biometrika Office, 1934.
[40] Pearson, Karl, editor, Tables of the Incomplete Γ-Function, London, Biometrika Office, 1951 (reissue).
[41] Pitman, E. J. G., "Significance tests which may be applied to samples from any populations," Supplement to the Journal of the Royal Statistical Society, 4 (1937), 119-30.
[42] Pitman, E. J. G., "Significance tests which may be applied to samples from any populations. II. The correlation coefficient test," Supplement to the Journal of the Royal Statistical Society, 4 (1937), 225-32.
[43] Pitman, E. J. G., "Significance tests which may be applied to samples from any populations. III. The analysis of variance test," Biometrika, 29 (1937), 322-35.
[44] Rijkoort, P. G., "A generalization of Wilcoxon's test," Proceedings, Koninklijke Nederlandse Akademie van Wetenschappen, 53 (1952).
[45] Scheffé, Henry, "Statistical inference in the non-parametric case," Annals of Mathematical Statistics, 14 (1943), 305-32.
[46] Snedecor, George W., Statistical Methods, Ames, Iowa State College Press, 1937 and later.
[47] Splawa-Neyman, Jerzy, "Próba uzasadnienia zastosowań rachunku prawdopodobieństwa do doświadczeń polowych. (Sur les applications de la théorie des probabilités aux expériences agricoles. Essai des principes)," Roczniki Nauk Rolniczych, 10 (1923), 1-51. (Polish with German summary.)
[48] Stevens, W. L., "Distribution of groups in a sequence of alternatives," Annals of Eugenics, 9 (1939), 10-17.
[48a] 'Student,' "An experimental determination of the probable error of Dr. Spearman's correlation coefficient," Biometrika, 13 (1921), 263-82. Reprinted in 'Student's' Collected Papers (edited by E. S. Pearson and John Wishart), London, Biometrika Office, n.d., 70-89.
[49] Swed, Frieda S., and Eisenhart, C., "Tables for testing randomness of grouping in a sequence of alternatives," Annals of Mathematical Statistics, 14 (1943), 66-87.
[50] Terpstra, T. J., "A non-parametric k-sample test and its connection with the H test." Unpublished manuscript.
[50a] Terpstra, T. J., "The asymptotic normality and consistency of Kendall's test against trend, when ties are present in one ranking," Indagationes
Mathematicae, 14 (1952), 327-33.
[51] Todhunter, Isaac, A History of the Mathematical Theory of Probability from the Time of Pascal to That of Laplace, New York, Chelsea Publishing Company, 1949 (first edition 1865).
[51a] van Dantzig, D., "On the consistency and the power of Wilcoxon's two sample test," Indagationes Mathematicae, 13 (1951), 1-8; also Proceedings, Koninklijke Nederlandse Akademie van Wetenschappen, 54 (1951), 1-8.
[52] van der Vaart, H. R., "Some remarks on the power of Wilcoxon's test for the problem of two samples," Proceedings, Koninklijke Nederlandse Akademie van Wetenschappen, 53 (1950), 494-506, 507-20.
[53] Wald, A., and Wolfowitz, J., "On a test whether two samples are from the same population," Annals of Mathematical Statistics, 11 (1940), 147-62.
[54] Wald, A., and Wolfowitz, J., "Statistical tests based on permutations of the observations," Annals of Mathematical Statistics, 15 (1944), 358-72.
[55] Wallis, W. Allen, "The correlation ratio for ranked data," Journal of the American Statistical Association, 34 (1939), 533-8.
[56] Wallis, W. Allen, "Rough-and-ready statistical tests," Industrial Quality Control, 8 (1952), 35-40.
[57] Welch, B. L., "On the z-test in randomized blocks and Latin Squares," Biometrika, 29 (1937), 21-52.
[58] Westenberg, J., "Significance test for median and interquartile range in samples from continuous populations of any form," Proceedings, Koninklijke Nederlandse Akademie van Wetenschappen, 51 (1948), 252-61.
[59] White, Colin, "The use of ranks in a test of significance for comparing two treatments," Biometrics, 8 (1952), 33-41.
[60] Whitney, D. R., "A bivariate extension of the U statistic," Annals of Mathematical Statistics, 22 (1951), 274-82.
[61] Wilcoxon, Frank, "Individual comparisons by ranking methods," Biometrics Bulletin (now Biometrics), 1 (1945), 80-3.
[62] Wilcoxon, Frank, "Probability tables for individual comparisons by ranking methods," Biometrics, 3 (1947), 119-22.
[63] Wolfowitz, J., "Non-parametric statistical inference," Proceedings of the Berkeley Symposium on Mathematical Statistics and Probability (edited by Jerzy Neyman), Berkeley and Los Angeles, University of California Press, 1949, 93-113.

View File

@@ -0,0 +1,13 @@
Title: Use of Ranks in One-Criterion Variance Analysis
Creator: page2pdf-2.1
Producer: iText® 5.5.8 ©2000-2015 iText Group NV (AGPL-version); modified using iText® 7.1.3 ©2000-2018 iText Group NV (JSTOR Michigan; licensed version)
CreationDate: 08/08/16 14:47:12
ModDate: 09/15/20 00:10:21
Tagged: yes
Form: none
Pages: 40
Encrypted: no
Page size: 595 x 882 pts (rotated 0 degrees)
File size: 3284124 bytes
Optimized: no
PDF version: 1.7

View File

@@ -0,0 +1,827 @@
Bidirectional recurrent neural networks
Article in IEEE Transactions on Signal Processing · December 1997
DOI: 10.1109/78.650093 · Source: IEEE Xplore
IEEE TRANSACTIONS ON SIGNAL PROCESSING, VOL. 45, NO. 11, NOVEMBER 1997
Bidirectional Recurrent Neural Networks
Mike Schuster and Kuldip K. Paliwal, Member, IEEE
Abstract— In the first part of this paper, a regular recurrent neural network (RNN) is extended to a bidirectional recurrent neural network (BRNN). The BRNN can be trained without the limitation of using input information just up to a preset future frame. This is accomplished by training it simultaneously in positive and negative time direction. Structure and training procedure of the proposed network are explained. In regression and classification experiments on artificial data, the proposed structure gives better results than other approaches. For real data, classification experiments for phonemes from the TIMIT database show the same tendency.
In the second part of this paper, it is shown how the proposed bidirectional structure can be easily modified to allow efficient estimation of the conditional posterior probability of complete symbol sequences without making any explicit assumption about the shape of the distribution. For this part, experiments on real data are reported.
Index Terms—Recurrent neural networks.
I. INTRODUCTION
A. General
MANY classification and regression problems of engineering interest are currently solved with statistical approaches using the principle of “learning from examples.” For a certain model with a given structure inferred from the prior knowledge about the problem and characterized by a number of parameters, the aim is to estimate these parameters accurately and reliably using a finite amount of training data. In general, the parameters of the model are determined by a supervised training process, whereas the structure of the model is defined in advance. Choosing a proper structure for the model is often the only way for the designer of the system to put in prior knowledge about the solution of the problem.
Artificial neural networks (ANNs) (see [2] for an excellent introduction) are one group of models that take the principle "infer the knowledge from the data" to an extreme. In this paper, we are interested in studying ANN structures for one particular class of problems that are represented by temporal sequences of input-output data pairs. For these types of problems, which occur, for example, in speech recognition, time series prediction, dynamic control systems, etc., one of the challenges is to choose an appropriate network structure
Manuscript received June 5, 1997. The associate editor coordinating the review of this paper and approving it for publication was Prof. Jenq-Neng Hwang.
M. Schuster is with the ATR Interpreting Telecommunications Research Laboratory, Kyoto, Japan.
K. K. Paliwal is with the ATR Interpreting Telecommunications Research Laboratory, Kyoto, Japan, on leave from the School of Microelectronic Engineering, Griffith University, Brisbane, Australia.
Publisher Item Identifier S 1053-587X(97)08055-0.
that, at least theoretically, is able to use all available input information to predict a point in the output space.
Many ANN structures have been proposed in the literature to deal with time varying patterns. Multilayer perceptrons (MLPs) have the limitation that they can only deal with static data patterns (i.e., input patterns of a predefined dimensionality), which requires definition of the size of the input window in advance. Waibel et al. [16] have pursued time delay neural networks (TDNNs), which have proven to be a useful improvement over regular MLPs in many applications. The basic idea of a TDNN is to tie certain parameters in a regular MLP structure without restricting the learning capability of the ANN too much. Recurrent neural networks (RNNs) [5], [8], [12], [13], [15] provide another alternative for incorporating temporal dynamics and are discussed in more detail in a later section.
In this paper, we investigate different ANN structures for incorporating temporal dynamics. We conduct a number of experiments using both artificial and real-world data. We show the superiority of RNNs over the other structures. We then point out some of the limitations of RNNs and propose a modified version of an RNN called a bidirectional recurrent neural network, which overcomes these limitations.
B. Technical
Consider a (time) sequence of input data vectors and a sequence of corresponding output data vectors, with neighboring data pairs (in time) being somehow statistically dependent. Given such input and output time sequences as training data, the aim is to learn the rules to predict the output data given the input data. Inputs and outputs can, in general, be continuous and/or categorical variables. When outputs are continuous, the problem is known as a regression problem, and when they are categorical (class labels), the problem is known as a classification problem. In this paper, the term prediction is used as a general term that includes regression and classification.
1) Unimodal Regression: For unimodal regression or function approximation, the components of the output vectors are continuous variables. The ANN parameters are estimated to maximize some predefined objective criterion (e.g., maximize the likelihood of the output data). When the distribution of the errors between the desired and the estimated output vectors is assumed to be Gaussian with zero mean and a fixed global data-dependent variance, the likelihood criterion reduces to the
Fig. 1. General structure of a regular unidirectional RNN shown (a) with a delay line and (b) unfolded in time for two time steps.
convenient Euclidean distance measure between the desired and the estimated output vectors, or the mean-squared-error criterion, which has to be minimized during training [2]. It has been shown by a number of researchers [2], [9] that neural networks can estimate the conditional average of the desired output (or target) vectors at their network outputs, i.e., E[y_t | x_t], where E[·] is an expectation operator.
2) Classification: In the case of a classification problem, one seeks the most probable class out of a given pool of classes for every time frame t, given an input vector sequence. To make this kind of problem suitable to be solved by an ANN, the categorical variables are usually coded as vectors as follows. Consider that c_t is the desired class label for the frame at time t. Then, construct an output vector such that its c_t-th component is one and the other components are zero. The output vector sequence constructed in this manner, along with the input vector sequence, can be used to train the network under some optimality criterion, usually the cross-entropy criterion [2], [9], which results from a maximum likelihood estimation assuming a multinomial output distribution. It has been shown [3], [6], [9] that the k-th network output at each time point can be interpreted as an estimate of the conditional posterior probability of class membership for class k, with the quality of the estimate depending on the size of the training data and the complexity of the network.
For some applications, it is not necessary to estimate the conditional posterior probability of a single class given the sequence of input vectors, but the conditional posterior probability of a sequence of classes given the sequence of input vectors.¹
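A compact way to see the two reductions mentioned above (maximum likelihood under a Gaussian error model reducing to the mean-squared-error criterion, and maximum likelihood under a multinomial output model reducing to the cross-entropy criterion) is the following sketch in generic notation; the symbols y for 1-of-K targets and o for network outputs are not the paper's own notation.

```latex
% Gaussian error model with fixed variance: maximizing the likelihood
% is equivalent to minimizing the squared error.
-\log \prod_t \mathcal{N}(y_t \mid o_t, \sigma^2 I)
  = \frac{1}{2\sigma^2}\sum_t \lVert y_t - o_t \rVert^2 + \text{const}

% Multinomial (1-of-K) output model: maximizing the likelihood is
% equivalent to minimizing the cross-entropy.
-\log \prod_t \prod_k o_{t,k}^{\,y_{t,k}}
  = -\sum_t \sum_k y_{t,k} \log o_{t,k}
```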
C. Organization of the Paper
This paper is organized in two parts. Given a series of paired input/output vectors, we want to train bidirectional recurrent neural networks to perform the following tasks.
• Unimodal regression (i.e., compute the conditional expectation of the output) or classification (i.e., compute the conditional posterior probability for every output class and decide the class using the maximum a posteriori decision rule). In this case, the outputs are treated as statistically independent. Experiments for this part are conducted for artificial toy data as well as for real data.
• Estimation of the conditional probability of a complete sequence of classes of length T using all available input information. In this case, the outputs are treated as being statistically dependent, which makes the estimation more difficult and requires a slightly different network structure than the one used in the first part. For this part, results of experiments for real data are reported.
¹ Here, we want to make a distinction between C_t and c_t: C_t is a categorical random variable, and c_t is its value.
II. PREDICTION ASSUMING INDEPENDENT OUTPUTS
A. Recurrent Neural Networks
RNNs provide a very elegant way of dealing with (time) sequential data that embodies correlations between data points that are close in the sequence. Fig. 1 shows a basic RNN architecture with a delay line and unfolded in time for two time steps. In this structure, the input vectors are fed one at a time into the RNN. Instead of using a fixed number of input vectors as done in the MLP and TDNN structures, this architecture can make use of all the available input information up to the current time frame (i.e., x_1, ..., x_t) to predict y_t. How much of this information is captured by a particular RNN depends on its structure and the training algorithm. An illustration of the amount of input information used for prediction with different kinds of NNs is given in Fig. 2.
Future input information coming up later than t is usually also useful for prediction. With an RNN, this can be partially achieved by delaying the output by a certain number of time frames D to include future information up to x_{t+D} to predict y_t (Fig. 2). Theoretically, D could be made very large to capture all the available future information, but in practice, it is found that prediction results drop if D is too large. A possible explanation for this could be that with rising D, the modeling power of the RNN is increasingly concentrated on remembering the input information up to x_{t+D} for the prediction of y_t, leaving less modeling power for combining the prediction knowledge from different input vectors.
While delaying the output by some frames has been used successfully to improve results in a practical speech recognition system [12], which was also confirmed by the experiments conducted here, the optimal delay is task dependent and has to be found by the "trial and error" method on a validation test set. Certainly, a more elegant approach would be desirable.
Fig. 2. Visualization of the amount of input information used for prediction by different network structures.
Fig. 3. General structure of the bidirectional recurrent neural network (BRNN) shown unfolded in time for three time steps.
To use all available input information, it is possible to use two separate networks (one for each time direction) and then somehow merge the results. Both networks can then be called experts for the specific problem on which the networks are trained. One way of merging the opinions of different experts is to assume the opinions to be independent, which leads to arithmetic averaging for regression and to geometric averaging (or, alternatively, to an arithmetic averaging in the log domain) for classification. These merging procedures are referred to as linear opinion pooling and logarithmic opinion pooling, respectively [1], [7]. Although simple merging of network outputs has been applied successfully in practice [14], it is generally not clear how to merge network outputs in an optimal way since different networks trained on the same data can no longer be regarded as independent.
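As a concrete illustration of the two pooling rules just mentioned, the following sketch (not from the paper; array shapes and variable names are assumptions) merges the outputs of two separately trained experts.

```python
import numpy as np

def linear_opinion_pool(pred_a, pred_b):
    """Arithmetic average of two regression experts (linear opinion pool)."""
    return 0.5 * (np.asarray(pred_a) + np.asarray(pred_b))

def log_opinion_pool(probs_a, probs_b):
    """Normalized geometric mean of two classifiers' class-posterior
    estimates (logarithmic opinion pool)."""
    merged = np.sqrt(np.asarray(probs_a) * np.asarray(probs_b))
    return merged / merged.sum(axis=-1, keepdims=True)

# Example: two experts' class posteriors for one frame and three classes.
p_forward = np.array([0.7, 0.2, 0.1])
p_backward = np.array([0.5, 0.4, 0.1])
print(log_opinion_pool(p_forward, p_backward))
```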
B. Bidirectional Recurrent Neural Networks
To overcome the limitations of a regular RNN outlined in the previous section, we propose a bidirectional recurrent neural network (BRNN) that can be trained using all available input information in the past and future of a specific time frame.
1) Structure: The idea is to split the state neurons of a regular RNN into a part that is responsible for the positive time direction (forward states) and a part for the negative time direction (backward states). Outputs from forward states are not connected to inputs of backward states, and vice versa. This leads to the general structure that can be seen in Fig. 3, where it is unfolded over three time steps. It is not possible to display the BRNN structure in a figure similar to Fig. 1 with the delay line, since the delay would have to be positive and negative in time. Note that without the backward states, this structure simplifies to a regular unidirectional forward RNN, as shown in Fig. 1. If the forward states are taken out, a regular RNN with a reversed time axis results. With both time directions taken care of in the same network, input information in the past and the future of the currently evaluated time frame can directly be used to minimize the objective function without the need for delays to include future information, as for the regular unidirectional RNN discussed above.
2) Training: The BRNN can principally be trained with the same algorithms as a regular unidirectional RNN because there are no interactions between the two types of state neurons and, therefore, it can be unfolded into a general feedforward network. However, if, for example, any form of back-propagation through time (BPTT) is used, the forward and backward pass procedure is slightly more complicated because the update of state and output neurons can no longer be done one at a time. If BPTT is used, the forward and backward passes over the unfolded BRNN over time are done almost in the same way as for a regular MLP. Some special treatment is necessary only at the beginning and the end of the training data. The forward state inputs at t = 1 and the backward state inputs at t = T are not known. Setting these could be made part of the learning process, but here, they are set arbitrarily to a fixed value (0.5). In addition, the local state derivatives at t = T for the forward states and at t = 1 for the backward states are not known and are set here to zero, assuming that the information beyond that point is not important for the current update, which is, for the boundaries, certainly the case. The training procedure for the unfolded bidirectional network over time can be summarized as follows (a rough sketch of the forward pass is given after the procedure).
1) FORWARD PASS: Run all input data for one time slice 1 <= t <= T through the BRNN and determine all predicted outputs.
   a) Do forward pass just for forward states (from t = 1 to t = T) and backward states (from t = T to t = 1).
   b) Do forward pass for output neurons.
2) BACKWARD PASS: Calculate the part of the objective function derivative for the time slice 1 <= t <= T used in the forward pass.
   a) Do backward pass for output neurons.
   b) Do backward pass just for forward states (from t = T to t = 1) and backward states (from t = 1 to t = T).
3) UPDATE WEIGHTS
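The following sketch illustrates the forward pass of step 1 for a single sequence. It is a simplified reading of the procedure above; the weight-matrix layout, the tanh nonlinearity, and the absence of an output nonlinearity are assumptions, not details taken from the paper.

```python
import numpy as np

def brnn_forward(x, Wf, Vf, Wb, Vb, Wo_f, Wo_b, bo):
    """x: (T, d_in) input sequence. Returns (T, d_out) network outputs.
    Forward states run t = 1..T, backward states run t = T..1, and both
    feed the output layer of the same time step."""
    T = x.shape[0]
    nf, nb = Vf.shape[0], Vb.shape[0]
    h_f = np.full(nf, 0.5)          # unknown initial forward state, fixed to 0.5
    h_b = np.full(nb, 0.5)          # unknown initial backward state, fixed to 0.5
    fwd, bwd = np.zeros((T, nf)), np.zeros((T, nb))
    for t in range(T):              # positive time direction
        h_f = np.tanh(Wf @ x[t] + Vf @ h_f)
        fwd[t] = h_f
    for t in reversed(range(T)):    # negative time direction
        h_b = np.tanh(Wb @ x[t] + Vb @ h_b)
        bwd[t] = h_b
    return np.array([Wo_f @ fwd[t] + Wo_b @ bwd[t] + bo for t in range(T)])
```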
C. Experiments and Results
In this section, we describe a number of experiments with the goal of comparing the performance of the BRNN structure with that of other structures. In order to provide a fair comparison, we have used different structures with a comparable number of parameters as a rough complexity measure. The experiments are done for artificial data for both regression and classification tasks with small networks to allow extensive experiments and for real data for a phoneme classification task with larger networks.
1) Experiments with Artificial Data:
TABLE I: DETAILS OF REGRESSION AND CLASSIFICATION ARCHITECTURES EVALUATED IN OUR EXPERIMENTS
a) Description of Data: In these experiments, an artificial data set is used to conduct a set of regression and classification experiments. The artificial data is generated as follows. First, a stream of 10 000 random numbers between zero and one is created as the one-dimensional (1-D) input data to the ANN. The 1-D output data (the desired output) is obtained as the weighted sum of the inputs within a window of 10 frames to the left and 20 frames to the right with respect to the current frame, with the weighting falling off linearly on both sides. The weighting procedure introduces correlations between neighboring input/output data pairs that become weaker for data pairs further apart. Note that the correlations are not symmetrical: the window on the right side of each frame is twice as "broad" as the window on the left side. For the classification experiments, the output data is mapped to two classes, with class 0 for all output values below (or equal to) 0.5 and class 1 for all output values above 0.5, giving approximately 59% of the data to class 0 and 41% to class 1.
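A possible way to generate data of this kind is sketched below; the exact linear weighting used in the paper is not legible in the extracted text, so the triangular weights here are an assumption.

```python
import numpy as np

rng = np.random.default_rng(0)
T, left, right = 10_000, 10, 20

x = rng.uniform(0.0, 1.0, size=T)                # 1-D input stream

# Assumed triangular weights: fall off linearly over 10 frames to the left
# and 20 frames to the right of the current frame.
w_left = np.arange(1, left + 1) / (left + 1)     # rising toward the frame
w_right = np.arange(right, 0, -1) / (right + 1)  # falling away from it
weights = np.concatenate([w_left, [1.0], w_right])
weights /= weights.sum()

pad = np.pad(x, (left, right), mode="edge")      # handle sequence boundaries
y = np.array([pad[t : t + left + right + 1] @ weights for t in range(T)])
labels = (y > 0.5).astype(int)                   # class 1 above 0.5, class 0 otherwise
```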
b) Experiments: Separate experiments are conducted for regression and classification tasks. For each task, four different architectures are tested (Table I). Type “MERGE” refers to the merged results of type RNN-FOR and RNN-BACK because they are regular unidirectional recurrent neural networks trained in the forward and backward time directions, respectively. The first three architecture types are also evaluated over different shifts of the output data in the positive time direction, allowing the RNN to use future information, as discussed above.
Every test (ANN training/evaluation) is run 100 times with different initializations of the ANN to get at least partially rid of random fluctuations of the results due to convergence to local minima of the objective function. All networks are trained with 200 cycles of a modified version of the resilient propagation (RPROP) technique [10], extended to an RPROP-through-time variant. All weights in the structure are initialized in the range ( ) drawn from the uniform distribution, except the output biases, which are set so that the corresponding output gives the prior average of the output data in case of zero input activation.
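For reference, a minimal sketch of the sign-based RPROP update rule [10] that the modified trainer builds on follows; the step-size constants are the commonly cited defaults, not values taken from this paper.

```python
import numpy as np

def rprop_step(w, grad, prev_grad, step, eta_plus=1.2, eta_minus=0.5,
               step_min=1e-6, step_max=50.0):
    """One RPROP update: adapt per-weight step sizes from the sign of the
    gradient and move against the current gradient sign."""
    sign_change = grad * prev_grad
    step = np.where(sign_change > 0, np.minimum(step * eta_plus, step_max), step)
    step = np.where(sign_change < 0, np.maximum(step * eta_minus, step_min), step)
    grad = np.where(sign_change < 0, 0.0, grad)   # skip the update after a sign flip
    w = w - np.sign(grad) * step
    return w, grad, step
```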
For the regression experiments, the networks use the activation function and are trained to minimize the mean-squared-error objective function. For type "MERGE," the arithmetic mean of the network outputs of "RNN-FOR" and "RNN-BACK" is taken, which assumes them to be independent, as discussed above for the linear opinion pool.
For the classification experiments, the output layer uses the “softmax” output function [4] so that outputs add up to one and can be interpreted as probabilities. As commonly used for ANNs to be trained as classifiers, the cross-entropy objective function is used as the optimization criterion. Because the outputs are probabilities assumed to be generated by independent events, for type “MERGE,” the normalized geometric mean (logarithmic opinion pool) of the network outputs of “RNN-FOR” and “RNN-BACK” is taken.
c) Results: The results for the regression and the classification experiments averaged over 100 training/evaluation runs can be seen in Figs. 4 and 5, respectively. For the regression task, the mean squared error depending on the shift of the output data in positive time direction seen from the time axis of the network is shown. For the classification task, the recognition rate, instead of the mean value of the objective function (which would be the mean cross-entropy), is shown
Fig. 4. Averaged results (100 runs) for the regression experiment on artificial data over different shifts of the output data with respect to the input data in future direction (viewed from the time axis of the corresponding network) for several structures.
because it is a more familiar measure to characterize results of classification experiments.
Several interesting properties of RNNs in general can be directly seen from these figures. The minimum (maximum) for the regression (classification) task should be at 20 frames delay for the forward RNN and at 10 frames delay for the backward RNN because at those points, all information for a perfect regression (classification) has been fed into the network. Neither is the case because the modeling power of the networks given by the structure and the number of free parameters is not sufficient for the optimal solution. Instead, the single time direction networks try to make a tradeoff between “remembering” the past input information, which is useful for regression (classification), and “knowledge combining” of currently available input information. This results in an optimal delay of one (two) frame for the forward RNN and five (six) frames for the backward RNN. The optimum delay is larger for the backward RNN because the artificially created correlations in the training data are not symmetrical with the important information for regression (classification) being twice as dense on the left side as on the right side of each frame. In the case of the backward RNN, the time series is evaluated from right to left with the denser information coming up later. Because the denser information can be evaluated easier (fewer parameters are necessary for a contribution to the objective function minimization), the optimal delay is larger for the backward RNN. If the delay is so large that almost no important information can be saved over time, the network converges to the best possible solution based only on prior information. This can be seen for the
classification task with the backward RNN, which converges to 59% (prior of class 0) for more than 15 frames delay.
Another sign of the tradeoff between "remembering" and "knowledge combining" is the variation in the standard deviation of the results, which is only shown for the backward RNN in the classification task. In areas where both mechanisms could be useful (a 3 to 17 frame shift), different local minima of the objective function correspond to a certain extent to either one of these mechanisms, which results in larger fluctuations of the results than in areas where "remembering" is not very useful (-5 to 3 frame shift) or not possible (17 to 20 frame shift).
If the outputs of forward and backward RNNs are merged so that all available past and future information for regression (classification) is present, the results for the delays tested here (-2 to 10) are, in almost all cases, better than with only one network. This is no surprise because, besides the use of more useful input information, the number of free parameters for the model has doubled.
For the BRNN, it does not make sense to delay the output data because the structure is already designed to cope with all available input information on both sides of the currently evaluated time point. Therefore, the experiments for the BRNN are only run for SHIFT = 0. For the regression and classification tasks tested here, the BRNN clearly performs better than the network "MERGE" built out of the single time-direction networks "RNN-FOR" and "RNN-BACK," with a comparable number of total free parameters.
2) Experiments with Real Data: The goal of the experiments with real data is to compare different ANN structures
Fig. 5. Averaged results for the classification experiment on artificial data.
for the classification of phonemes from the TIMIT speech database. Several regular MLPs and recurrent neural network architectures, which make use of different amounts of acoustic context, are tested here.
a) Description of Data: The TIMIT phoneme database is a well-established database consisting of 6300 sentences spoken by 630 speakers (ten sentences per speaker). Following official TIMIT recommendations, two of the sentences (which are the same for every speaker) are not included in our experiments, and the remaining data set is divided into two sets: 1) the training data set consisting of 3696 sentences from 462 speakers and 2) the test data set consisting of 1344 sentences from 168 speakers. The TIMIT database provides hand segmentation of each sentence in terms of phonemes and a phonemic label for every segment out of a pool of 61 phonemes. This gives 142 910 phoneme segments for training and 51 681 for testing.
In our experiments, every sentence is transformed into a vector sequence using three levels of feature extraction. First, features are extracted every frame to represent the raw waveform in a compressed form. Then, with the knowledge of the boundary locations from the corresponding label files, segment features are extracted to map the information from an arbitrary length segment to a fixed-dimensional vector. A third transformation is applied to the segment feature vectors to make them suitable as inputs to a neural net. These three steps are briefly described below.
1) Frame Feature Extraction: As frame features, 12 regular MFCCs (from 24 mel-spaced frequency bands) plus the log-energy are extracted every 10 ms with a 25.6-ms Hamming window and a preemphasis of 0.97. This is a commonly used feature extraction procedure for speech signals at the frame level [17].
2) Segment Feature Extraction: From the frame features, the segment features are extracted by dividing the segment in time into five equally spaced regions and computing the area under the curve in each region, with the function values between the data points linearly interpolated. This is done separately for each of the 13 frame features. The duration of the segment is used as an additional segment feature. This results in a 66-dimensional segment feature vector.
3) Neural Network Preprocessing: Although ANNs can principally handle any form of input distribution, we have found in our experiments that the best results are achieved with Gaussian input distributions, which matches the experience from [12]. To generate an "almost-Gaussian" distribution, the inputs are first normalized to zero mean and unit variance on a sentence basis, and then, every feature of a given channel² is quantized using a scalar quantizer having 256 reconstruction levels (1 byte). The scalar quantizer is designed to maximize the entropy of the channel for the whole training data. The maximum entropy scalar quantizer can be easily designed for each channel by arranging the channel points in ascending order according to their feature values and putting (almost) an equal number of
2 Here, each vector has a dimensionality of 66. Temporal sequence of each component (or feature) of this vector defines one channel. Thus, we have here 66 channels.
channel points in each quantization cell. For presentation to the network, the byte-coded value is remapped by applying the inverse error function erf⁻¹ to the (suitably scaled) byte value [erf is part of the math.h library in C]. This mapping produces on average a distribution that is similar to a Gaussian distribution.
TABLE II: TIMIT PHONEME CLASSIFICATION RESULTS FOR FULL TRAINING AND TEST DATA SETS WITH 13 000 PARAMETERS
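A rough sketch of this preprocessing for one channel is given below; the exact remapping formula is not legible in the extracted text, so the scaling around the inverse error function is an assumption.

```python
import numpy as np
from scipy.special import erfinv
from scipy.stats import rankdata

def gaussianize_channel(values, levels=256):
    """Equal-frequency (maximum-entropy) quantization of one channel into
    `levels` cells, followed by an inverse-error-function remapping that
    yields an approximately Gaussian input distribution."""
    ranks = rankdata(values, method="average")               # 1..n
    cells = np.floor((ranks - 0.5) / len(values) * levels)   # byte code 0..levels-1
    u = (cells + 0.5) / levels                                # cell centers in (0, 1)
    return np.sqrt(2.0) * erfinv(2.0 * u - 1.0)               # approx. standard normal
```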
The feature extraction procedure described above transforms every sentence into a sequence of fixed-dimensional vectors representing acoustic phoneme segments. The sequence of these segment vectors (along with their phoneme class labels) is used to train and test different ANN structures for classification experiments, as described below.
b) Experiments: Experiments are performed here with different ANN structures (e.g., MLP, RNN, and BRNN), which allow the use of different amounts of acoustic context. The MLP structure is evaluated for three different amounts of acoustic context as input.
1) one segment; 2) three segments (middle, left, and right); 3) five segments (middle, two left, and two right).
The evaluated RNN structures are unidirectional forward and backward RNNs that use all acoustic context on one side, two forward RNNs with one and two segment delays to incorporate right-hand information, the merged network built out of the unidirectional forward and backward RNNs, and the BRNN. The structures of all networks are adjusted so that each of them has about the same number of free parameters (approximately 13 000 here).
c) Results: Table II shows the phoneme classification results for the full training and test set. Although the database is labeled to 61 symbols, a number of researchers have chosen to map them to a subset of 39 symbols. Here, results are given for both versions, with the results for 39 symbols being simply a mapping from the results obtained for 61 symbols. Details of this standard mapping can be found in [11].
The baseline performance assuming neighboring segments to be independent gives a 59.67% recognition rate (MLP-1) on the test data. If three consecutive segments are taken as the inputs (MLP-3), loosening the independence assumption to three segments, the recognition rate goes up to 65.69%. Using five segments (MLP-5), the structure is not flexible enough to make use of the additional input information, and as a result, the recognition rate drops to 64.32%. The forward and backward RNNs (FOR-RNN, BACK-RNN), making use of input information only on one side of the current segment, give lower recognition rates (63.2% and 61.91%) than the forward RNN with one segment delay (65.83%). With a two segment delay, too much information has to be saved over time, and the result drops to 63.27% (FOR-RNN, two delay), although theoretically, more input information than for the previous network is present. The merging of the outputs of two separate networks (MERGE) trained in each time direction gives a recognition rate of 65.28% and is worse than the forward RNN structure using one segment delay. The bidirectional recurrent neural network (BRNN) structure results in the best performance (68.53%).
III. PREDICTION ASSUMING DEPENDENT OUTPUTS
In the preceding section, we have estimated the conditional posterior probability of a single class at a certain time point t, given the sequence of input vectors. For some applications, it is necessary to estimate the conditional posterior probability of the sequence of all classes from t = 1 to t = T instead of a single class, given the sequence of input vectors. This is a difficult problem, and no general practical solution is known, although this type of estimation is essential for many pattern recognition applications where sequences are involved.
A. Approach
Bidirectional recurrent neural networks can provide an approach to estimate such sequence posterior probabilities. Using the chain rule of probability, we decompose the sequence posterior probability in two ways:

  P(c_1, ..., c_T | x_1, ..., x_T) = Π_{t=1}^{T} P(c_t | c_{t+1}, ..., c_T, x_1, ..., x_T)   (backward posterior probability)
  P(c_1, ..., c_T | x_1, ..., x_T) = Π_{t=1}^{T} P(c_t | c_1, ..., c_{t-1}, x_1, ..., x_T)   (forward posterior probability)

The probability term within each product is the conditional probability of an output class given all the input to the right- and left-hand side plus the class sequence on one side of the currently evaluated input vector. The two ways of decomposing the sequence posterior probability (many more are possible) are here referred to as the forward and the backward posterior probabilities. Note that these decompositions are only a simple application of probability rules, i.e., no assumptions concerning the shape of the distributions are made.
In the present approach, the goal is to train a network to estimate conditional probabilities of this kind (the probability terms in the products). The estimates for these probabilities can then be combined by using the formulas above to estimate the full conditional probability of the sequence. It should be noted that the forward and the backward posterior probabilities are exactly equal, provided the probability estimator is perfect. However, if neural networks are used as probability estimators, this will rarely be the case because different architectures or different local minima of the objective function to be minimized correspond to estimators of different performance. It might therefore be useful to combine several estimators to get a better estimate of the quantity of interest using the methods of the previous section. Two candidates that could be merged here are the forward and the backward conditional probability estimates at each time point t.
Fig. 6. Modified bidirectional recurrent neural network structure shown here with extensions for the forward posterior probability estimation.
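To make the decomposition concrete, the sketch below scores a class sequence from per-step conditional probabilities and merges the forward and backward estimates in the log domain; the function names are assumptions, not the paper's code.

```python
import numpy as np

def sequence_log_prob(step_probs):
    """step_probs[t] = estimated P(c_t | classes on one side, all inputs).
    The product over t gives the sequence posterior; sum the logs instead."""
    return float(np.sum(np.log(step_probs)))

def merged_log_prob(forward_step_probs, backward_step_probs):
    """Logarithmic opinion pool of the forward and backward estimates of
    the same sequence posterior (treated as independent estimators)."""
    lp_f = sequence_log_prob(forward_step_probs)
    lp_b = sequence_log_prob(backward_step_probs)
    return 0.5 * (lp_f + lp_b)

# Example with a three-step class sequence.
print(merged_log_prob([0.9, 0.7, 0.8], [0.85, 0.75, 0.8]))
```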
B. Modified Bidirectional Recurrent Neural Networks
A slightly modified BRNN structure can efficiently be used to estimate conditional probabilities of this kind, which are conditioned on the continuous input vectors and on discrete inputs (the class labels on one side of the currently evaluated time point). Assume that the input for a specific time t is coded as one long vector containing the target output class and the original input vector, with, for example, the discrete input coded in the first dimensions of the whole input vector. To make the BRNN suitable to estimate these probabilities, two changes are necessary. First, instead of connecting the forward and backward states to the current output states, they are connected to the next and previous output states, respectively, and the inputs are directly connected to the outputs. Second, if in the resulting structure the weight connections from the class-coding input dimensions to the backward states and to the outputs are cut, then only discrete input information from the classes preceding the current time point can be used to make predictions. This is exactly what is required to estimate the forward posterior probability. Fig. 6 illustrates this change of the original BRNN architecture. Cutting the input connections to the forward states instead of the backward states gives the architecture for estimating the backward posterior probability. Theoretically, all discrete and continuous inputs that are necessary to estimate the probability are still accessible for a contribution to the prediction. During training, the bidirectional structure can adapt to the best possible use of the input information, as opposed to structures that do not provide part of the input information because of the limited size of the input windows (e.g., in MLP and TDNN) or one-sided windows (unidirectional RNN).
TABLE III CLASSIFICATION RESULTS FOR FULL TIMIT TRAINING AND TEST DATA WITH 61 (39) SYMBOLS
C. Experiments and Results
1) Experiments: Experiments are performed using the full TIMIT data set. To include the output (target) class information, the original 66-dimensional feature vectors are extended to 72 dimensions. In the first six dimensions, the corresponding output class is coded in a binary format and mapped to the network input range. Two different structures of the modified BRNN (one for the forward and the other for the backward posterior probability) are trained separately as classifiers using the cross-entropy objective function. The output neurons have the softmax activation function and the remaining ones the activation function. The forward (backward) modified BRNN has 64 (32) forward and 32 (64) backward states. Additionally, 64 hidden neurons are implemented before the output layer. This results in a forward (backward) modified BRNN structure with 26 333 weights. These two structures, as well as their combination (merged as a linear and a logarithmic opinion pool), are evaluated for phoneme classification on the test data.
2) Results: The results for the phoneme classification task are shown in Table III. It can be seen that the combination of the forward and backward modified BRNN structures results in much better performance than the individual structures. This shows that the two structures, even though they are trained on the same training data set to compute the same probability, are providing different estimates of this probability, and as a result, the combination of the two networks gives better results. The slightly better results for the logarithmic opinion pool with respect to the linear opinion pool suggest that it is reasonable to treat the two estimates of this probability as independent, although the two structures are trained on the same data set. It should be noted that the modified BRNN structure is only a tool to estimate the conditional probability of a given class
sequence and that it does not provide a class sequence with the highest probability. For this, all possible class sequences have to be searched to get the most probable class sequence (which is a procedure that has to be followed if one is interested in a problem like continuous speech recognition). In the experiments reported in this section, we have used the class sequence provided by the TIMIT data base. Therefore, the context on the (right or left) output side is known and is correct.
IV. DISCUSSION AND CONCLUSION
In the first part of this paper, a simple extension to a regular recurrent neural network structure has been presented, which makes it possible to train the network in both time directions simultaneously. Because the network concentrates on minimizing the objective function for both time directions simultaneously, there is no need to worry about how to merge outputs from two separate networks. There is also no need to search for an “optimal delay” to minimize the objective function in a given data/network structure combination because all future and past information around the currently evaluated time point is theoretically available and does not depend on a predefined delay parameter. Through a series of extensive experiments, it has been shown that the BRNN structure leads to better results than the other ANN structures. In all these comparisons, the number of free parameters has been kept to be approximately the same. The training time for the BRNN is therefore about the same as for the other RNNs. Since the search for an optimal delay (an additional search parameter during development) is not necessary, the BRNNs can provide, in comparison to other RNNs investigated in this paper, faster development of real applications with better results.
In the second part of this paper, we have shown how to use slightly modified bidirectional recurrent neural nets for the estimation of the conditional probability of symbol sequences without making any explicit assumption about the shape of the output probability distribution. It should be noted that the modified BRNN structure is only a tool to estimate the conditional probability of a given class sequence; it does not provide the class sequence with the highest probability. For this, all possible class sequences have to be searched to get the most probable class sequence. We are currently working on designing an efficient search engine, which will use only ANNs to find the most probable class sequence.
REFERENCES
[1] J. O. Berger, Statistical Decision Theory and Bayesian Analysis. Berlin, Germany: Springer-Verlag, 1985.
[2] C. M. Bishop, Neural Networks for Pattern Recognition. Oxford, U.K.: Clarendon, 1995.
[3] H. Bourlard and C. Wellekens, "Links between Markov models and multilayer perceptrons," IEEE Trans. Pattern Anal. Machine Intell., vol. 12, pp. 1167–1178, Dec. 1990.
[4] J. S. Bridle, "Probabilistic interpretation of feed-forward classification network outputs, with relationships to statistical pattern recognition," in Neurocomputing: Algorithms, Architectures and Applications, F. Fougelman-Soulie and J. Herault, Eds. Berlin, Germany: Springer-Verlag, 1989, NATO ASI Series, vol. F68, pp. 227–236.
[5] C. L. Giles, G. M. Kuhn, and R. J. Williams, "Dynamic recurrent neural networks: Theory and applications," IEEE Trans. Neural Networks, vol. 5, pp. 153–156, Apr. 1994.
[6] H. Gish, "A probabilistic approach to the understanding and training of neural network classifiers," in Proc. IEEE Int. Conf. Acoust., Speech, Signal Process., 1990, pp. 1361–1364.
[7] R. A. Jacobs, "Methods for combining experts' probability assessments," Neural Comput., vol. 7, no. 5, pp. 867–888, 1995.
[8] B. A. Pearlmutter, "Learning state space trajectories in recurrent neural networks," Neural Comput., vol. 1, pp. 263–269, 1989.
[9] M. D. Richard and R. P. Lippman, "Neural network classifiers estimate Bayesian a posteriori probabilities," Neural Comput., vol. 3, no. 4, pp. 461–483, 1991.
[10] M. Riedmiller and H. Braun, "A direct adaptive method for faster backpropagation learning: The RPROP algorithm," in Proc. IEEE Int. Conf. Neural Networks, 1993, pp. 586–591.
[11] T. Robinson, "Several improvements to a recurrent error propagation network phone recognition system," Cambridge Univ. Eng. Dept. Tech. Rep. CUED/F-INFENG/TR82, Sept. 1991.
[12] A. J. Robinson, "An application of recurrent neural nets to phone probability estimation," IEEE Trans. Neural Networks, vol. 5, pp. 298–305, Apr. 1994.
[13] T. Robinson, M. Hochberg, and S. Renals, "The use of recurrent neural networks in continuous speech recognition," in Automatic Speech Recognition: Advanced Topics, C. H. Lee, F. K. Soong, and K. K. Paliwal, Eds. Boston, MA: Kluwer, 1996, pp. 233–258.
[14] T. Robinson, M. Hochberg, and S. Renals, "Improved phone modeling with recurrent neural networks," in Proc. IEEE Int. Conf. Acoust., Speech, Signal Process., vol. 1, 1994, pp. 37–40.
[15] D. E. Rumelhart, G. E. Hinton, and R. J. Williams, "Learning internal representations by error backpropagation," in Parallel Distributed Processing, vol. 1, D. E. Rumelhart and J. L. McClelland, Eds. Cambridge, MA: MIT Press, 1986, pp. 318–362.
[16] A. Waibel, T. Hanazawa, G. Hinton, K. Shikano, and K. J. Lang, "Phoneme recognition using time-delay neural networks," IEEE Trans. Acoust., Speech, Signal Processing, vol. 37, pp. 328–339, Mar. 1989.
[17] S. Young, "A review of large vocabulary speech recognition," IEEE Signal Processing Mag., vol. 15, pp. 45–57, May 1996.
Mike Schuster received the M.Sc. degree in electronic engineering in 1993 from the Gerhard Mercator University, Duisburg, Germany. Currently, he is also working toward the Ph.D. degree at the Nara Institute of Technology, Nara, Japan.
After doing some research in fiber optics at the University of Tokyo, Tokyo, Japan, and some research in gesture recognition in Duisburg, he started at Advanced Telecommunication Research (ATR), Kyoto, Japan, to work on speech recognition. His research interests include neural networks and stochastic modeling in general, Bayesian approaches, information theory, and coding.
Kuldip K. Paliwal (M'89) is a Professor and Chair of Communication/Information Engineering at Griffith University, Brisbane, Australia. He has worked at a number of organizations, including the Tata Institute of Fundamental Research, Bombay, India, the Norwegian Institute of Technology, Trondheim, Norway, the University of Keele, U.K., AT&T Bell Laboratories, Murray Hill, NJ, and Advanced Telecommunication Research (ATR) Laboratories, Kyoto, Japan. He has co-edited two books: Speech Coding and Synthesis (New York: Elsevier, 1995) and Speech and Speaker Recognition: Advanced Topics (Boston, MA: Kluwer, 1996). His current research interests include speech processing, image coding, and neural networks. Dr. Paliwal received the 1995 IEEE Signal Processing Society Senior Award. He is an Associate Editor of the IEEE TRANSACTIONS ON SPEECH AND AUDIO PROCESSING.

View File

@@ -0,0 +1,16 @@
Title: Bidirectional Recurrent Neural Networks - Signal Processing, IEEE Transactions on
Subject:
Keywords:
Author: IEEE
Creator: IEEE Copyright
Producer: CVISION Technologies' PDFCompressor 2.0
CreationDate: 08/03/98 11:15:13
ModDate: 07/24/02 11:35:25
Tagged: no
Form: none
Pages: 10
Encrypted: no
Page size: 612 x 792 pts (letter) (rotated 0 degrees)
File size: 221633 bytes
Optimized: no
PDF version: 1.4

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,13 @@
Title: Extended Tables of the Wilcoxon Matched Pair Signed Rank Statistic
Creator: Acrobat 5.0 Paper Capture Plug-in for Windows
Producer: Acrobat 4.0 Import Plug-in for Windows
CreationDate: 03/16/12 09:37:10
ModDate: 03/21/12 16:35:18
Tagged: no
Form: none
Pages: 9
Encrypted: no
Page size: 634.32 x 833.04 pts (rotated 0 degrees)
File size: 641008 bytes
Optimized: no
PDF version: 1.4

View File

@@ -0,0 +1 @@
{"pageIndex":0,"scale":"page-width","top":833,"left":-7,"scrollMode":0,"spreadMode":0}

View File

@@ -0,0 +1,754 @@
Sanz et al. BMC Bioinformatics (2018) 19:432 https://doi.org/10.1186/s12859-018-2451-4
METHODOLOGY ARTICLE
Open Access
SVM-RFE: selection and visualization of the most relevant features through non-linear kernels
Hector Sanz1* , Clarissa Valim2,3, Esteban Vegas1, Josep M. Oller1 and Ferran Reverter1,4
Abstract
Background: Support vector machines (SVM) are a powerful tool to analyze data with a number of predictors approximately equal or larger than the number of observations. However, originally, application of SVM to analyze biomedical data was limited because SVM was not designed to evaluate importance of predictor variables. Creating predictor models based on only the most relevant variables is essential in biomedical research. Currently, substantial work has been done to allow assessment of variable importance in SVM models but this work has focused on SVM implemented with linear kernels. The power of SVM as a prediction model is associated with the flexibility generated by use of non-linear kernels. Moreover, SVM has been extended to model survival outcomes. This paper extends the Recursive Feature Elimination (RFE) algorithm by proposing three approaches to rank variables based on non-linear SVM and SVM for survival analysis.
Results: The proposed algorithms allow visualization of each one of the RFE iterations and, hence, identification of the most relevant predictors of the response variable. Using simulation studies based on time-to-event outcomes and three real datasets, we evaluate the three methods, based on pseudo-samples and kernel principal component analysis, and compare them with the original SVM-RFE algorithm for non-linear kernels. The three algorithms we propose performed generally better than the gold standard RFE for non-linear kernels when comparing the truly most relevant variables with the variable ranks produced by each algorithm in simulation studies. Generally, the RFE-pseudo-samples approach outperformed the other three methods, even when variables were assumed to be correlated, in all tested scenarios.
Conclusions: The proposed approaches can be implemented with accuracy to select variables and to assess the direction and strength of associations in the analysis of biomedical data using SVM for categorical or time-to-event responses. Variable selection and interpretation of the direction and strength of associations between predictors and outcomes, particularly with the RFE-pseudo-samples approach, can be carried out accurately when analyzing biomedical data. These approaches perform better than the classical RFE of Guyon for realistic scenarios about the structure of biomedical data.
Keywords: Support vector machines, Relevant variables, Recursive feature elimination, Kernel methods
* Correspondence: hsrodenas@gmail.com 1Department of Genetics, Microbiology and Statistics, Faculty of Biology, Universitat de Barcelona, Diagonal, 643, 08028 Barcelona, Catalonia, Spain Full list of author information is available at the end of the article
© The Author(s). 2018 Open Access This article is distributed under the terms of the Creative Commons Attribution 4.0 International License (http://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons license, and indicate if changes were made. The Creative Commons Public Domain Dedication waiver (http://creativecommons.org/publicdomain/zero/1.0/) applies to the data made available in this article, unless otherwise stated.
Background Analysis of investigations aiming to classify or predict response variables in biomedical research is oftentimes challenging because of data sparsity generated by limited sample sizes and a moderate or very large number of predictors. Moreover, in biomedical research, it is particularly relevant to learn about the relative importance of predictors to shed light on mechanisms of association or to save costs when developing biomarkers and surrogates. Each marker included in an assay increases the price of the biomarker, and several technologies used to measure biomarkers can accommodate only a limited number of markers. Support Vector Machine (SVM) models are a powerful tool to identify predictive models or classifiers, not only because they accommodate sparse data well but also because they can classify groups or create predictive rules for data that cannot be classified by linear decision functions. In spite of that, SVM has only recently become popular in the biomedical literature, partially because SVMs are complex and partially because SVMs were originally geared towards creating classifiers based on all available variables and did not allow assessing variable importance.
Currently, there are three categories of methods to assess importance of variables in SVM: filter, wrapper, and embedded methods. The problem with the existing approaches within these three categories is that they are mainly based on SVM with linear kernels. Therefore, the existing methods do not allow implementing SVM in data that cannot be classified by linear decision functions. The best approaches to work with non-linear kernels are wrapper methods because filter methods are less efficient than wrapper methods and embedded methods are focused on linear kernels. The gold standard of wrapper methods is recursive feature elimination (RFE) proposed by Guyon et al. [1]. Although wrapper methods outweigh other procedures, there is no approach implemented to visualize RFE results. The RFE algorithm for non-linear kernels allows ranking variables but not comparing the performance of all variables in a specific iteration, i.e., interpreting results in terms of: association with the response variable, association with the other variables and magnitude of this association, which is a key point in biomedical research. Moreover, previous work with the RFE algorithm for non-linear kernels has generally focused on classification and disregarded time-to-event responses with censoring that are common in biomedical research.
The work presented in this article expands RFE to visualize variable importance in the context of SVM with non-linear kernels and SVM for survival responses. More specifically, we propose: i) a RFE-based algorithm that allows visualization of variable importance by plotting the
predictions of the SVM model; and ii) two variants from the RFE-algorithms based on representation of variables into a multidimensional space such as the KPCA space. In the first section, we briefly review existing methods to evaluate importance of variables by ranking, by selecting variables, and by allowing visualization of variable relative importance. In the Methods section, we present our proposed approaches and extensions. Next, in Results, we evaluate the proposed approaches using simulated data and three real datasets. Finally, we discuss the main characteristics and obtained results of all three proposed methods.
Existing approaches to assess variable importance The approaches to assess variable importance in SVM can be grouped in filter, embedded and wrapper method classes. Filter methods assess the relevance of variables by looking only at the intrinsic properties of the data without taking into account any information provided by the classification algorithm. In other words, they perform variable selection before fitting the learning algorithm. In most cases, a variable relevance score is calculated, and low-scoring variables are removed. Afterwards, the “relevant” variable subset is input into the classification algorithm. Filter methods include the F-score [2, 3].
Embedded methods are built into a classifier and, thus, are specific to a given learning algorithm. In the SVM framework, all embedded methods are limited to linear kernels. Additionally, most of these methods are based on some form of penalization term, i.e., variables are penalized depending on their values, with some methods explicitly constraining the number of variables and others penalizing the number of variables [4, 5]. An additional exact algorithm was developed for SVM in classification problems using the Benders decomposition algorithm [6]. Finally, a penalized version of the SVM with different penalization terms was suggested by Becker et al. [7, 8].
Wrapper methods evaluate a specific subset of variables by training and testing a specific classification model, and are thus, tailored to a specific classification algorithm. The idea is to search the space of all variable subsets with an algorithm wrapped around the classification model. However, as the space of variables subset grows exponentially with the number of variables, heuristic search methods are used to guide the search for an optimal subset. Guyon et al. [1] proposed one of the most popular wrapper approaches for variable selection in SVM. The method is known as SVM-Recursive Feature Elimination (SVM-RFE) and, when applied to a linear kernel, the algorithm is based on the steps shown in Fig. 1. The final output of this algorithm is a ranked list with variables ordered according to their relevance. In the same paper, the authors proposed an approximation for non-linear
Fig. 1 Pseudo-code of the SVM-RFE algorithm using the linear kernel in a model for binary classification
kernels. The idea is based on measuring the smallest change in the cost function while assuming no change in the value of the estimated parameters in the optimization problem. Thus, one avoids retraining a classifier for every candidate variable to be eliminated.
The SVM-RFE method is basically a backward elimination procedure. However, the variables that are top ranked (eliminated last) are not necessarily the ones that are individually most relevant, but the most relevant conditional on the specific ranked subset in the model. Only taken together are the variables of a subset optimal in some sense. So, for instance, if we focus on the variable ranked p, we know that in the model containing the variables ranked 1 to p, the variable ranked p is the least relevant.
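For orientation, a minimal sketch of the linear SVM-RFE loop of Guyon et al. [1] described above follows (one variable eliminated per iteration, ranking by the squared weight of the linear decision function). It assumes a binary classification problem and relies on scikit-learn's SVC, which is a tooling assumption rather than part of the original work.

```python
import numpy as np
from sklearn.svm import SVC

def linear_svm_rfe(X, y):
    """Return variable indices ordered from least to most relevant."""
    remaining = list(range(X.shape[1]))
    elimination_order = []
    while len(remaining) > 1:
        clf = SVC(kernel="linear", C=1.0).fit(X[:, remaining], y)
        scores = clf.coef_.ravel() ** 2          # ranking criterion w_i^2
        worst = int(np.argmin(scores))
        elimination_order.append(remaining.pop(worst))
    elimination_order.append(remaining[0])
    return elimination_order                      # reverse for most-relevant-first
```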
The wrapper approaches include the interaction between variable subset search and model selection as well as the ability to take into account variable correlations. A common drawback of these techniques is that they have a higher risk of overfitting than filter methods and are computationally intensive, especially if building the classifier has a high computational cost [9]. Additional work has been done to assess variable importance in non-linear kernels SVM by modifying SVM-RFE [3, 10, 11].
The methods we propose in the next section are based on a wrapper approach, specifically on the RFE algorithm, allowing visualization and interpretation of the relevant variables in each RFE iteration using linear or non-linear kernels and fitting SVM extensions such as SVM for survival analysis.
Methods
RFE-pseudo-samples One of our proposed methods follows and extends the idea proposed in Krooshof et al. [12] and Postma et al. [13] to visualize the importance of variables using pseudo-samples in the kernel partial least squares and the support vector regression (SVR) contexts, respectively. The proposed method is applicable to SVM models classifying binary outcomes. Briefly, the main steps are the following:
1. Optimize the SVM method and tune the parameters.
2. For each variable of interest, create a pseudo-samples matrix with equally distanced values z taken from the range of the original variable, while keeping the other variables set to their mean or median, as in (1). The values z_1, ..., z_q can be quantiles of the variable for an arbitrary number q of selected quantiles. As the data is usually normalized, we assume that the mean is 0. There will be p pseudo-sample matrices of dimension q x p. For instance, for variable 1, the pseudo-sample matrix will look like (1), with q pseudo-sample vectors.
                     V1   V2   V3   ...  Vp
   pseudosample_1    z1   0    0    ...  0
   pseudosample_2    z2   0    0    ...  0
   pseudosample_3    z3   0    0    ...  0
   ...               ...  ...  ...  ...  ...
   pseudosample_q    zq   0    0    ...  0        (1)

3. Obtain the predicted decision value (not the predicted class) from the SVM (a real negative or positive value) for each pseudo-sample, using the SVM model fitted in step 1. Basically, this decision value corresponds to the distance of each observation from the SVM margins.
4. Measure the variability of each variable's predictions using the univariate robust metric median absolute deviation (MAD). This measure is expressed for a given variable p as

   MAD_p = median(|D_qp - median(D_p)|) · c

where D_qp is the decision value of variable p for pseudo-sample q and median(D_p) is the median of all decision values for the evaluated variable p. The constant c is equal to 1.4826, and it is incorporated in the expression to ensure consistency in terms of expectation, so that E(MAD(D_1, ..., D_n)) = σ for D_i distributed as N(μ, σ²) and large n [14, 15].
5. Remove the variable with the lowest MAD value.
6. Repeat steps 2-5 until there is only one variable left (applying in this way the RFE algorithm as detailed in Fig. 2).
The rationale of the proposed method is that, for variables associated with the response, modifications in the variable will affect predictions. On the contrary, for variables not associated with the response, changes in the variable value will not affect predictions, and the decision value will be approximately constant. Therefore, since the decision value can be used as a score that measures the distance to the hyperplane, the larger the absolute value, the more confident we are that the observation belongs to the predicted class defined by the sign.
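A minimal sketch of one iteration of steps 2-5 for a fitted binary SVM follows; it uses scikit-learn's decision_function and scipy's median_abs_deviation, which are tooling assumptions, and evaluates q quantile-based pseudo-samples per variable.

```python
import numpy as np
from scipy.stats import median_abs_deviation
from sklearn.svm import SVC

def mad_scores(model: SVC, X: np.ndarray, q: int = 20) -> np.ndarray:
    """MAD of the SVM decision values over pseudo-samples of each variable
    (other variables held at 0, i.e., their mean after standardization)."""
    n_vars = X.shape[1]
    scores = np.empty(n_vars)
    for j in range(n_vars):
        z = np.quantile(X[:, j], np.linspace(0.0, 1.0, q))   # pseudo-sample values
        pseudo = np.zeros((q, n_vars))
        pseudo[:, j] = z
        decision = model.decision_function(pseudo)
        scores[j] = median_abs_deviation(decision, scale="normal")  # c = 1.4826
    return scores  # the variable with the smallest score is eliminated next
```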
Fig. 2 Pseudo-code of the RFE-pseudo-samples algorithm applied to a time-to-event (right-censored) response variable
Visualization of variables The RFE-pseudo-samples algorithm allows us to plot the decision values over the range of all variables; in this way we account for:
 Strength and direction of the association between individual variables and the response: since we are plotting the range of the variable against the decision value, we are able to detect whether larger values of the variable are protective or risk factors.
 The proposed method fixes the values of the non-evaluated variables to 0, but this can be modified to evaluate the performance of the desired variables by fixing the values to any other biologically meaningful value.
 The distribution of the data can be indicative of the type of association of each variable with respect to the response, i.e., U-shaped, linear or exponential, for example.
 The variability in the decision values can be indicative of the relevance of the variable to the response. Given a variable, the more variability in the decision values along its range, the more strongly the variable is associated with the response.
RFE-kernel principal components input variables Reverter et al. [16] proposed a method using the kernel principal component analysis (KPCA) space (more detail on the KPCA methodology in Additional file 1) to represent, for each variable, the direction of maximum growth locally. So, given two leading components, the maximum growth for each variable is indicated in a plot in which each axis is one of the components. After representing all observations in the new space, if a variable is relevant in this context, it will show a clear direction across all samples; if it is not, the samples' directions will be random. In the same work, the authors suggest incorporating functions of the original variables into the KPCA space, so it is possible to plot not only the growth of individual variables but also combinations of them, if this makes sense within the research study. Our proposed method, referred to as RFE-KPCA-maxgrowth, consists of the following steps:
1. Fit the SVM.
2. Create the KPCA space using the tuned parameters found in the SVM process with all variables, if possible (for example, when the kernel used in the SVM is the same as in the KPCA).
3. Represent the observations with respect to the first two components of the KPCA.
4. Compute and represent the input variables and the decision function of the SVM in the KPCA output, as detailed in the Representation of input variables section.
5. Compute the angle of each variable-observation direction with the decision function in the KPCA output; an average angle over all observations can then be calculated for each variable (Ranking of variables section).
6. Calculate, for each variable, the difference between its average angle and the median of all variables' average angles. The variable closest to the median is classified as the least relevant, as detailed in the Ranking of variables section.
7. Remove the least relevant variable.
8. Repeat steps 1 to 7 until there is one variable left.
Representation of input variables
We approach the problem of the interpretability of kernel methods by mapping simultaneously data points and relevant variables in a low-dimensional linear manifold immersed in the kernel-induced feature space H [17]. Such a linear manifold, usually a plane, can be determined according to some statistical requirement; for instance, we shall require that the final Euclidean interdistances between points in the plot are, as far as possible, similar to the interdistances in the feature space, which leads us to the KPCA. We have to distinguish between the feature space H and the surface in that space to which points in the input space actually map, which we denote by ϕ(X). In general, ϕ(X) is a p-dimensional manifold embedded in H. We assume here that ϕ(X) is sufficiently smooth that a Riemannian metric can be defined on it [18].
The intrinsic geometrical properties of ϕ(X) can be derived once we know the Riemannian metric induced by the embedding of ϕ(X) in H. The Riemannian metric can be defined by a symmetric metric tensor g_ab. Although the explicit mapping needed to construct g_ab is unknown, the tensor can be written solely in terms of the kernel [17].
Any relevant variable can be described by a real-valued function f defined on the input space. Since we assume that the feature map ϕ is one-to-one, we can identify f with f̃ ≡ f ∘ ϕ⁻¹ defined on ϕ(X). We aim to represent the gradient of f̃. The gradient of f̃ is a vector field defined on ϕ(X) through its components under the coordinates x = (x1, …, xp) as
\big(\mathrm{grad}\,\tilde{f}\big)^a = \sum_{b=1}^{p} g^{ab}(x)\, D_b f(x), \quad a = 1, \ldots, p \qquad (2)
where g^{ab} is the inverse of the metric matrix G = (g_{ab}) and D_b denotes the partial derivative with respect to the b-th variable.
The curves v corresponding to the integral flow of the gradient are the curves whose tangent vectors at t are v'(t) = grad(f̃). These curves indicate, locally, the directions of maximum variation of f̃. Under the coordinates x = (x1, …, xp), the integral flow is the general solution of the first-order differential equation system

\frac{dx^a}{dt} = \sum_{b=1}^{p} g^{ab}(x)\, D_b f(x), \quad a = 1, \ldots, p \qquad (3)
which always has a local solution given the initial conditions v(t0) = w.
To help interpret the KPCA output, we can plot the projected v(t) curves (obtained in Eq. 3), which indicate, locally, the directions of maximum variation of f̃, or alternatively the corresponding gradient vector given in (2).
Let v(t) = k(∙, x(t)) where x(t) are the solutions of (3). If we define
Z_t = \big(k(x(t), x_i)\big)_{n\times 1}, \qquad (4)
the induced curve ṽ(t), expressed in matrix form, is given by the row vector

\tilde{v}(t)_{1\times r} = \left(Z_t' - \tfrac{1}{n}\,\mathbf{1}_n' K\right)\left(I_n - \tfrac{1}{n}\,\mathbf{1}_n \mathbf{1}_n'\right)\tilde{V} \qquad (5)
where Z_t has the form (4) and the prime symbol (') indicates transposition.
We can also represent the gradient vector field of f̃, that is, the tangent vector field corresponding to the curve v(t), through its projection onto the KPCA output. The tangent vector at t = t_0, with x_0 = ϕ⁻¹(v(t_0)), is given by dv/dt |_{t=t_0}, and its projection, in matrix form, is given by the row vector

\left(\frac{d\tilde{v}}{dt}\bigg|_{t=t_0}\right)_{1\times r} = \frac{dZ_t'}{dt}\bigg|_{t=t_0}\left(I_n - \tfrac{1}{n}\,\mathbf{1}_n \mathbf{1}_n'\right)\tilde{V} \qquad (6)
with

\frac{dZ_t'}{dt}\bigg|_{t=t_0} = \left(\frac{dZ_t^1}{dt}\bigg|_{t=t_0}, \;\ldots,\; \frac{dZ_t^n}{dt}\bigg|_{t=t_0}\right)' \qquad (7)
and

\frac{dZ_t^i}{dt}\bigg|_{t=t_0} = \frac{d\,k(x(t), x_i)}{dt}\bigg|_{t=t_0} = \sum_{a=1}^{p} D_a k(x_0, x_i)\,\frac{dx^a}{dt}\bigg|_{t=t_0} \qquad (8)

where dx^a/dt |_{t=t_0} is defined in (3).
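The centering and projection in Eq. (5) can be sketched directly in Python. The following minimal example, under the assumption of a Gaussian kernel k(x1, x2) = exp(−||x1 − x2||²/σ) and with all names chosen for illustration, projects a new point (and hence any point of a curve v(t)) onto the first r kernel principal components.

```python
import numpy as np

def rbf(a, b, sigma):
    return np.exp(-np.sum((a - b) ** 2) / sigma)

def kpca_project(X, x_new, sigma, r=2):
    """Project x_new onto the first r kernel principal components of X,
    following the centering used in Eq. (5).  Assumes the leading
    eigenvalues of the centered kernel matrix are strictly positive."""
    n = X.shape[0]
    K = np.array([[rbf(xi, xj, sigma) for xj in X] for xi in X])
    H = np.eye(n) - np.ones((n, n)) / n            # centering matrix I - (1/n) 1 1'
    Kc = H @ K @ H
    lam, U = np.linalg.eigh(Kc)                    # eigenvalues in ascending order
    idx = np.argsort(lam)[::-1][:r]
    V = U[:, idx] / np.sqrt(lam[idx])              # scaled eigenvectors (the matrix V~ above)
    z = np.array([rbf(x_new, xi, sigma) for xi in X])
    return (z - K.mean(axis=0)) @ H @ V            # (Z' - (1/n)1'K)(I - (1/n)11') V~
```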
Ranking of variables Our proposal is to take advantage of the representation of the direction of input variables by applying two alternative approaches:
• To include the SVM predicted decision value of each training sample as an extra variable, which we call the reference variable, and then compare the direction of each input variable with that of the reference.
• To include the direction of the SVM decision function and use it as the reference direction. Since it is a real-valued function of the original variables, we can represent the direction of this expression. Specifically, the decision function, removing the sign function from the SVM expression, is given by
f(x) = \sum_{i=1}^{n} \alpha_i y_i\, k(x_i, x) + b \qquad (9)

which we can reformulate as

f(x) = \sum_{i=1}^{n} \varrho_i\, k(x_i, x) + b \qquad (10)
where ϱ_i = α_i y_i. Applying the representation of input variables methodology to function (10) and assuming a Gaussian kernel, expressed as k(x_1, x_2) = \exp\!\left(-\tfrac{1}{\sigma}\lVert x_1 - x_2\rVert^2\right), from formula (8) we obtain

\frac{dZ_t^i}{dt}\bigg|_{t=0} = k(x_i, x)\left[\sum_{j=1}^{n} \varrho_j \left(\sum_{a=1}^{p}\frac{(x_i^a - x^a)\,(x_j^a - x^a)}{\sigma}\right) k(x_j, x)\right]
For both the prediction values and the decision function, we can calculate the overall similarity of a variable with respect to the reference (either the prediction or the decision function) by averaging, over all training points, the angle between the variable's maximum-growth vector and the reference. So, if, for a given training point, the angle between the direction of maximum growth of variable p and the reference is 0° (0 rad), the direction vectors overlap and the variable and the reference are perfectly positively associated. If the angle is 180° (π radians), they point in opposite directions, indicating that they are perfectly negatively associated (Fig. 3). By averaging the angle over all training points we obtain a summary of the similarity of each variable with the reference and, consequently, of whether it is relevant or not. Assuming that there is noise in real data, a variable is classified as relevant or not relative to the others: the variable closest to the overall angle taking into account all variables is assumed
Fig. 3 Visual representation of variable importance. Vectors are the projection on the two leading KPCA axes of the vectors in the kernel feature space pointing in the direction of maximum local growth of the represented variables. In this scheme, the reference variable is in red and the original variables are in black. Each sample point anchors a vector representing the direction of maximum local growth. a When an original variable is associated with the reference variable, the angle between both vectors, averaged across all samples, is close to zero radians. b In contrast, when an original variable is negatively associated with the reference variable, the angle between both vectors, averaged across all samples, is close to π radians. c When an original variable does not show any association with the reference variable, the angle changes inconsistently among the samples. In noisy data, behavior (c) is expected to occur in most variables, so the variable with the average angle closest to the overall angle after accounting for all variables is assumed to be the least relevant
to be the least relevant. Based on this, we can apply an RFE-KPCA-maximum-growth approach for the prediction and for the decision function, as defined in Fig. 4.
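A minimal numpy sketch of the angle-averaging and elimination step (steps 5 and 6 of the algorithm), assuming the projected maximum-growth vectors of each variable and of the reference have already been computed, for instance via Eqs. (6)–(8); the names var_dirs and ref_dirs are hypothetical.

```python
import numpy as np

def least_relevant_variable(var_dirs, ref_dirs):
    """var_dirs: dict name -> (n_samples, 2) projected max-growth vectors.
    ref_dirs:   (n_samples, 2) projected reference directions (prediction or
                decision function).  Returns the per-variable mean angle and
                the variable whose mean angle is closest to the median angle,
                i.e. the candidate for elimination in this RFE iteration."""
    def mean_angle(v, r):
        cos = np.sum(v * r, axis=1) / (np.linalg.norm(v, axis=1) * np.linalg.norm(r, axis=1))
        return np.arccos(np.clip(cos, -1.0, 1.0)).mean()   # radians in [0, pi]
    angles = {name: mean_angle(v, ref_dirs) for name, v in var_dirs.items()}
    overall = np.median(list(angles.values()))
    least = min(angles, key=lambda name: abs(angles[name] - overall))
    return angles, least
```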
Visualization of importance of variables We can represent, for each observation, the original variables as vectors (with a pre-specified length) that indicate the direction of maximum growth of each variable, or of a function of each variable. When two variables are positively correlated, the directions of maximum growth for all samples should appear in the same direction and, in the perfect scenario, should overlap. When two variables are negatively correlated, the directions should be broadly opposite, i.e., a mirror image, and if they are not correlated, the directions should be random (Fig. 3).
Compared scenarios To fix ideas, we applied the three proposed approaches: RFE-pseudo-samples, RFE-KPCA-maxgrowth-prediction and RFE-KPCA-maxgrowth-decision and compared them to the RFE-Guyon for non-linear kernels. These methods are applied to analyse simulated and real time-to-event data with SVM. We simulated a time-to-event response variable and the corresponding censoring distribution. To evaluate the performance of the proposed methods in this survival framework, several scenarios involving different correlated variables have been simulated.
Fig. 4 Pseudo-code of the RFE-KPCA-maximum-growth algorithm for both function and prediction approach. The algorithm is applied to a time-to-event (right-censored) response variable
Simulation of scenarios and data generation We generated 100 datasets with a time-to-event response variable and 30 predictor variables following a multivariate normal distribution. The mean of each variable was a realization of a Uniform distribution U(0.03, 0.06) and the covariance matrix was computed so that all variables were classified into four groups according to their pairwise correlation: no correlation (around 0), low correlation (around 0.2), medium correlation (around 0.5) and high correlation (around 0.8). The variance of each variable was fixed at 0.7 (see correlation matrix in Additional file 2).
The time-to-event variable was simulated based on the proportional hazards assumption through a Gompertz distribution [19]:


T = \frac{1}{\alpha}\,\log\!\left(1 - \frac{\alpha\,\log(U)}{\gamma\,\exp(\langle \beta, x_i\rangle)}\right) \qquad (11)
where U is a variable following a Uniform(0,1) distribution, β is the vector of coefficients, and α ∈ (−∞, ∞) and γ > 0 are the scale and shape parameters of the Gompertz distribution. These parameters were selected so that overall survival was around 0.6 at 18 months of follow-up time.
The number of observations in each dataset was 50, and the censoring times followed a Uniform distribution allowing around 10% censoring.
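As a sketch of this data-generation step, Eq. (11) can be implemented in a few lines of Python; the parameter names and the uniform censoring step below are illustrative only, and the Gompertz parameters would have to be tuned, as in the paper, to reach the target overall survival.

```python
import numpy as np

def simulate_gompertz_times(X, beta, alpha, gamma, rng=None):
    """Survival times under a proportional-hazards Gompertz model, Eq. (11).
    X: (n, p) covariates, beta: (p,) coefficients, alpha/gamma: Gompertz parameters."""
    rng = np.random.default_rng(rng)
    u = rng.uniform(size=X.shape[0])          # U ~ Uniform(0, 1)
    lin = X @ beta                            # <beta, x_i>
    return (1.0 / alpha) * np.log(1.0 - alpha * np.log(u) / (gamma * np.exp(lin)))

# Illustrative use (hypothetical values): 50 observations, uniform censoring.
# T = simulate_gompertz_times(X, beta, alpha, gamma)
# C = np.random.default_rng().uniform(0, c_max, size=len(T))   # tune c_max for ~10% censoring
# time, event = np.minimum(T, C), T <= C
```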
Relevance of variables scenarios To evaluate the proposed methods, we generated the time-to-event response variable assuming the following scenarios: i) high and low pairwise correlations among predictors, with some predictors highly associated with the response and others not, ii) positive and negative associations with the response variable, and iii) linear and non-linear associations with the response variable and, in some cases, interactions among predictor variables. The relevant variables for each of the 6 simulated scenarios are:
1. Variable 1.
2. −Variable 29 + Variable 30.
3. −Variable 1 + Variable 8 + Variable 20 + Variable 29 − Variable 30.
4. Variable 1 + Variable 2 + Variable 1 × Variable 2.
5. Variable 1 + Variable 30 + Variable 1 × Variable 30 + Variable 20 + (Variable 20)².
6. Variable 1 + (Variable 1)² + exp(Variable 30).
Real-life datasets The PBC, Lung and DLBCL datasets freely available at the CRAN repository were used as real data to test the performance of the proposed methods. Briefly, datasets of the following studies were analyzed:
PBC: this data is from the Mayo Clinic trial in primary biliary cirrhosis of the liver conducted between 1974 and 1984. The study aimed to evaluate the performance of the drug D-penicillamine in a placebo-controlled randomized trial. The data contain 258 observations and 22 variables (17 of them predictors). From the whole cohort, 93 observations experienced the event, 65 finalized the follow-up period without an event and were thus censored, and 100 were censored before the end of the follow-up time of 2771 days, with an overall survival probability of 0.57.
 Lung: this study was conducted by the North Central Cancer Treatment Group (NCCTG) and aimed to estimate the survival of patients with advanced lung cancer. The available dataset included 167 observations, experiencing 89 events during the follow-up time of 420 days, and 10 variables. A total of 36 observations were censored before the end of follow-up. The overall survival was 0.40.
 DLBCL: this dataset contains gene expression data from diffuse large B-cell lymphoma (DLBCL) patients. The available dataset contains 40 observations and 10 variables representing the mean gene expression in 10 different clusters. From the analysed cohort 20 patients experienced the event, 10 finalized the
follow-up and 8 were right-censored during the 72-month follow-up period.
Cox proportional-hazards models were used and compared with the proposed methods. We applied the RFE algorithm and in each iteration the variable with lowest proportion of explainable log-likelihood in the Cox model was removed. To compare the obtained rank of variables the correlation between the ranks was computed. Additionally, the C statistic was computed by ranked variable and method to evaluate its discriminative ability.
Probabilistic SVM The data were analysed with a modified SVM for survival analysis that had previously been considered optimal for handling censored data [20]. The method, known as probabilistic SVM [21] (more details on this method in Additional file 3), allows some observations not to be perfectly defined and assigns them an uncertainty in their class. For these uncertain observations, a confidence level or probability regarding the class is provided.
Comparison of methods The parameters selected to perform the grid search for the Gaussian kernel were 0.25, 0.5, 1, 2 and 4. The C and C̃ values were 0.1, 1, 10 and 100. For each combination of parameters, in a parameter-tuning step, 10 training datasets were fitted and validated using 10 different validation datasets. Additionally, 10 training datasets, different from all datasets used in the tuning step, were simulated and fitted with the best combination found in the tuning step. The tuned parameters were fixed for each RFE iteration, i.e., they were not re-estimated at each iteration. Once the optimal parameters for the pSVM were found, the methods compared were:
• RFE-Guyon for non-linear data: this method was considered the gold standard.
• RFE-KPCA-maxgrowth-prediction: the KPCA is based on a Gaussian kernel with the parameters obtained in the pSVM model.
• RFE-KPCA-maxgrowth-decision: the KPCA is based on a Gaussian kernel with the parameters obtained in the pSVM model.
• RFE-pseudo-samples: the pseudo-samples are created by splitting the range of the data into 50 equidistant points. The range of the pseudo-samples goes from −2 to 2, since the variables are approximately normally distributed around 0.
Metrics to evaluate algorithm performance The mean and standard deviation of the ranks obtained over the 100 simulated datasets were used to summarize the performance by method and scenario. For the RFE-pseudo-samples algorithm, a first-iteration figure with all 100 datasets was created, summarizing the information by variable. For the RFE-maxgrowth approach, one of the datasets is presented as an example in order to interpret the method, since it was not possible to summarize all 100 principal component plots in one figure.
Results
Simulated datasets In this section, main results are described by algorithm and scenario. Results are structured according to overall ranking of variables and visualization and interpretation of two scenarios for illustrative purposes.
Overall ranking comparison Scenario 1 results are shown in Fig. 5. All 4 methods identified the relevant variable, with RFE-maxgrowth-prediction achieving the lowest average rank (thus, optimal), followed by RFE-maxgrowth-function, RFE-pseudo-samples and RFE-Guyon. For all methods except RFE-Guyon, a set of variables (variables 2 to 8) was closest to the Variable 1 rank. These variables were highly correlated with Variable 1.
For scenario 2 (Fig. 6), the true relevant variables were identified by all 4 algorithms, with fairly similar average ranks except for RFE-maxgrowth-function. The specific overall rank order was RFE-Guyon, RFE-maxgrowth-prediction, RFE-pseudo-samples and RFE-maxgrowth-function. The average rank of the other, non-relevant variables was similar for all methods. In this scenario the relevant variables were not correlated with any other variable in the dataset.
In scenario 3 (Fig. 7), 5 variables are relevant in the true model. The algorithms were able to detect the relevant non-correlated variables (variables 20, 29 and 30), except RFE-maxgrowth-function, which for this set of variables was the worst method. For the other 3 algorithms and this set of variables, RFE-pseudo-samples was slightly better and RFE-Guyon slightly worse than the others. For the other 2 highly correlated variables (Variable 1 and Variable 8), the two best methods were clearly RFE-pseudo-samples and RFE-maxgrowth-function.
In Scenario 4 (Fig. 8), all methods, except RFE-Guyon, detected the two relevant variables. However, RFE-maxgrowth-function identified as relevant, with a pretty similar rank, variables 3 to 8 (highly correlated with the true relevant ones). The RFE-pseudo-samples algorithm ranks increased as the correlation with the true relevant variables decreased.
For Scenario 5 (Fig. 9) three variables were relevant (1, 20 and 30). An interaction and a quadratic term were included. RFE-pseudo-samples was clearly the method that
[Figure: average rank over the 100 simulated datasets (y-axis) by variable in the dataset (x-axis) for RFE-Guyon, RFE-maxgrowth-function, RFE-maxgrowth-prediction and RFE-pseudo-samples]
Fig. 5 Scenario 1 results. Average rank by variable and method for the 100 simulated datasets for Scenario 1 (being Variable 1 the relevant variable).
Dotted vertical black line represents the variable used to generate the time-to-event variable. The lower the rank, the more relevant the variable is for
the specific algorithm
Fig. 6 Scenario 2 results. Average rank by variable and method for the 100 simulated datasets for Scenario 2 (being variables 29 and 30 the
relevant variables). Dotted vertical black lines represent the variable used to generate the time-to-event variable. The lower the rank, the more
relevant the variable is for the specific algorithm
Fig. 7 Scenario 3 results. Average rank by variable and method for the 100 simulated datasets for Scenario 3 (being variables 1, 8, 20, 29 and 30
the relevant variables). Dotted vertical black lines represent the variables used to generate the time-to-event variable. The lower the rank, the
more relevant the variable is for the specific algorithm
Fig. 8 Scenario 4 results. Average rank by variable and method for the 100 simulated datasets for Scenario 4 (being variables 1 and 2 the relevant
variables). Dotted vertical black lines represent the variables used to generate the time-to-event variable. The lower the rank the more relevant
the variable is for the specific algorithm
Fig. 9 Scenario 5 results. Average rank by variable and method for the 100 simulated datasets for Scenario 5 (being variables 1, 20 and 30 the
relevant variables). Dotted vertical black lines represent the variable used to generate the time-to-event variable. The lower the rank the more
relevant the variable is for the specific algorithm
Fig. 10 Scenario 6 results. Average rank by variable and method for the 100 simulated datasets for Scenario 6 (being variables 1 and 30 the
relevant variables). Dotted vertical black lines represent the variable used to generate the time-to-event variable. The lower the rank the more
relevant the variable is for the specific algorithm
best identified the relevant variables. The other three algorithms were not able to detect the three variables, although RFE-maxgrowth-function was able to identify as relevant, with a similar rank, variables 1 to 8 (highly correlated among them).
In Scenario 6 (Fig. 10), Variable 1 and Variable 30 were selected as relevant, the former included as a main effect with a quadratic term and the latter exponentiated. All methods except RFE-maxgrowth-function were able to detect the importance of Variable 30. With respect to Variable 1, RFE-pseudo-samples and RFE-maxgrowth-function yielded a similar rank of approximately 10.5. The other two algorithms, RFE-Guyon and RFE-maxgrowth-prediction, were not able to identify Variable 1 as relevant, with ranks for this variable comparable to those of other non-relevant variables.
Visualization of proposed methods RFE-pseudo-samples An example of the results for Scenario 2 (all other scenarios are included as Additional files 4, 5, 6, 7, 8 and 9), over the 100 simulated datasets and the first iteration of the RFE algorithm, is shown in Fig. 11. Two variables show a completely different pattern from the others: Variable 29 and Variable 30. Their associations with the response are mirror images of each other: for Variable 30, the larger the pseudo-sample value the larger the decision value, and for Variable 29, the larger the pseudo-sample value the lower the decision value. The other variables are fairly constant along the pseudo-sample range.
RFE-KPCA-maxgrowth prediction and function Figure 12 shows an example of the RFE-maxgrowth-prediction algorithm for Scenario 1 and iteration 25. To make the plot more interpretable, we only display the 5 variables selected as the most relevant: 1, 2, 25, 26 and 28. The first two were highly correlated (on average, a 0.8 Pearson correlation) and the others were independent by design. The reference is the prediction approach, but it is equivalent to the function approach. The first component (PC1) is the one that separates the event group: most events are negative and non-events are positive. For the reference, the directions go from non-event to event along PC1 and PC2. With respect to the other variables, only Variable 1 and Variable 2 present a pattern of directions for each observation similar to the reference. Variables 25, 26 and 28 look fairly random. The interpretation is that variables 1 and 2 behave like the reference, thus Variable 1 and Variable 2 are relevant and the others are not. Besides that, since the directions of 25, 26 and 28 are mutually random, they are not associated with the response and they are not correlated with each other, which is true by the data generation mechanism.
[Figure: decision value (y-axis) against pseudo-sample value from −2 to 2 (x-axis) for all 30 variables; the curves for var29 and var30 separate clearly from the remaining, nearly flat, variables]
Fig. 11 Visualization of RFE-pseudo-samples results for Scenario 2. Results for Scenario 2 (in which variables 29 and 30 were the relevant
variables) over all 100 simulated datasets, all 30 variables, and first iteration of the RFE-pseudo-samples algorithm. The pseudo-samples
distribution for each variable is shown with a non-parametric local regression estimation (LOESS) with the corresponding 95% confidence interval
Real-life datasets Figure 13 shows the Spearman correlation between the ranks obtained by each method for each of the variables in the three datasets. In all three real datasets, RFE-pseudo-samples and RFE-maxgrowth-prediction were the methods most correlated with the Cox model. In Additional files 10, 11 and 12, the rank comparison between the methods for the PBC, DLBCL and Lung datasets, respectively, is presented.
Figures 14, 15 and 16 show the C statistic results by method and real dataset. The discriminative ability of the RFE-pseudo-samples method is better than that of the other methods, especially in the DLBCL and PBC datasets, where the C statistics of the top-ranked variables (the ones classified by the algorithm as most relevant) are larger. The RFE-maxgrowth methods perform slightly better than RFE-Guyon, except in the DLBCL dataset (Fig. 16), where RFE-Guyon's performance is overall better, with larger C statistics at higher ranks.
Discussion In biomedical research, it is important to select the variables most associated with the studied outcome and to learn about the strength of this association. In SVM with non-linear kernels, variable selection is particularly challenging because the feature and input spaces are different, thus learning about variables in the feature space does not address the main question about variables in the original
space. Although non-linear kernels, especially the Gaussian kernel, are widely used, little work has been done comparing methods to select variables in SVM with non-linear kernels. Moreover, almost no work has focused on the interpretation and visualization of the predictor-response association in SVM with linear or non-linear kernels to help the analyst not only select variables but also learn about the strength and direction of the association. The algorithms we propose here for SVM aim to fill this gap and allow analysts to use SVM to better address common scientific questions, i.e., to select variables when using non-linear kernels and to learn about the strength of the predictor-response associations. Moreover, the presented algorithms are applicable to the analysis of time-to-event responses, which are often the primary outcomes in biomedical research.
The three algorithms we propose performed generally better than the gold standard RFE-Guyon for non-linear kernels. As expected, results for all methods were better when the true relevant variables were independent, i.e., when they were not correlated with the other variables in the SVM model. However, this scenario is rarely the case in biomedical research, particularly when the analysis includes several variables. Generally, RFE-pseudo-samples outperformed the other three methods in all tested scenarios. Additionally, the RFE-pseudo-samples algorithm rendered a friendlier visualization of results than RFE-Guyon.
[Figure: KPCA max-growth panels for var1, var2, var25, var26 and var28, plotted against the first two kernel principal components (PC 1, x-axis; PC 2, y-axis); events, non-events and censored observations are colour-coded]
Fig. 12 Visualization of RFE-KPCA-maxgrowth results for Scenario 1. Scenario 1 (being Variable 1 the relevant variable) results for a random simulated dataset and iteration 25 of the RFE-KPCA-maxgrowth-prediction approach. The first component of the KPCA (PC1) is represented in the X-axis and the second component (PC2) is represented in the Y-axis. Events, non-events (censored at the end of follow-up time) and losses to follow-up (censored during follow-up) are represented by red, green and blue color, respectively
With regard to RFE-maxgrowth, both the prediction and function approaches performed similarly. The prediction approach identified the relevant variables better than the function approach, and the function approach was less time consuming. The prediction approach can be interpreted as an instance of the function approach. Although RFE-maxgrowth-function was based on the explicit decision function and was thus expected to outperform the other three approaches, it did not perform as accurately as them. One explanation could be that by
Fig. 13 Spearman correlation matrix comparing 5 methods in the a PBC, b Lung and c DLBCL datasets. The Spearman correlation was computed comparing the ranks obtained by each one of the variables in the dataset
Fig. 14 Discriminative ability, measured as C statistic, by method and ranked variable in the PBC dataset. The X-axis shows the rank of each of the variables in the dataset after applying the RFE algorithm. The lower the rank, the more relevant the variable is and the larger the C statistic is expected to be. As each method can rank the variables differently, a given rank can correspond to different variables across methods; due to this, the C statistic (Y-axis) differs
approaching the decision function with a non-linear kernel as a combination of variables we are losing more information than by using RFE-maxgrowth-prediction.
In the RFE-maxgrowth-prediction algorithm, the prediction was included as an extra variable in the KPCA space. When this extra variable is included, the constructed space accounts for the patterns that define event and non-event in the KPCA and differs from the space constructed ignoring the prediction variable. However, in RFE-maxgrowth-function the KPCA space does not take into account any specific variable directly related to the classes.
The interpretation of the RFE-maxgrowth algorithm is more complex than the RFE-pseudo-samples algorithm because it includes interpretation of the components of the KPCA, the directions of maximum growth of each input variable, and the comparison of the
direction of the maximum growth of the input variables between the event and non-events. Although this approach is more informative, it can only be interpreted for a reduced number of variables.
When analyzing the three real datasets, the three SVM methods performed overall better than the Cox model, which is the classical statistical model for analyzing time-to-event data. Moreover, the three real datasets fit the Cox model assumptions in terms of sample size and number of variables. Within the proposed methods, RFE-pseudo-samples performed better than the others, with the top-ranked variables being the ones with the largest discriminative power. The RFE-maxgrowth methods performed slightly better than RFE-Guyon. The results obtained in the real datasets are consistent with those obtained in the simulation study.
Fig. 15 C statistics results by method and ranked variable in the Lung dataset. The X-axis shows the rank of each of the variables in the dataset after applying the RFE algorithm. The lower the rank, the more relevant the variable is and the larger the C statistic is expected to be. As each method can rank the variables differently, a given rank can correspond to different variables across methods; due to this, the C statistic (Y-axis) differs
Fig. 16 C statistics results by method and ranked variable in the DLBCL dataset. The X-axis shows the rank of each of the variables in the dataset after applying the RFE algorithm. The lower the rank, the more relevant the variable is and the larger the C statistic is expected to be. As each method can rank the variables differently, a given rank can correspond to different variables across methods; due to this, the C statistic (Y-axis) differs
The main limitation of the proposed methods is that they are more computationally intensive than the classical RFE-Guyon. That could be a limitation depending on the size of the database, the proportion of censored observations during the follow-up period, or the SVM extension model used to analyze the time-to-event data. However, this should not add extra complexity when analyzing binary response data with no censored observations.
Further extensions of the presented work are comparisons of the proposed methods with other machine learning algorithms used to identify relevant variables, such as Random Forest, Elastic Net or the Correlation-based Feature Selection evaluator, by analyzing simulated scenarios and real datasets. Additionally, future work should focus on another important part of the identification of relevant features, which is finding the method with the largest accuracy or discriminatory ability, and not only the identification of the true relevant variables.
Conclusion Conducting variable selection and interpreting associations between predictors and response variables with the proposed approaches, when analyzing biomedical data using SVM with non-linear kernels, has some advantages over the currently available RFE of Guyon. Additionally, the proposed approaches can be implemented with a high level of accuracy and speed, and with low computational cost, particularly when using the RFE-pseudo-samples algorithm. Although the proposed methods had more difficulty identifying relevant variables when those variables were highly correlated, they performed better than the classical RFE algorithm with non-linear kernels proposed by Guyon.
Additional files
Additional file 1: Kernel feature space and kernel principal component analysis methodology. (DOCX 42 kb)
Additional file 2: Pearson correlation matrix of the 30 variables simulated. (PDF 131 kb)
Additional file 3: Probabilistic support vector machine methodology. (DOCX 36 kb)
Additional file 4: Visualization of RFE-pseudo-samples results for Scenario 1. Scenario 1 (being Variable 1 the relevant variable) results for all 100 simulated datasets, all 30 variables and first iteration of the RFEpseudo-samples algorithm. The pseudo-samples distribution for each variable is shown with a non-parametric local regression estimation (LOESS) with the corresponding 95% confidence interval. (PDF 61 kb)
Additional file 5: Visualization of RFE-pseudo-samples results for Scenario 2. Scenario 2 (being Variable 29 and 30 the relevant variables) results for all 100 simulated datasets, all 30 variables and first iteration of the RFE-pseudo-samples algorithm. The pseudo-samples distribution for each variable is shown with a non-parametric local regression estimation (LOESS) with the corresponding 95% confidence interval. (PDF 59 kb)
Additional file 6: Visualization of RFE-pseudo-samples results for Scenario 3. Scenario 3 (being Variable 1, 8, 20, 29 and 30 the relevant variables) results for all 100 simulated datasets, all 30 variables and first iteration of the RFE-pseudo-samples algorithm. The pseudo-samples distribution for each variable is shown with a non-parametric local regression estimation (LOESS) with the corresponding 95% confidence interval. (PDF 57 kb)
Additional file 7: Visualization of RFE-pseudo-samples results for Scenario 4. Scenario 4 (being Variable 1 and 2 the relevant variables) results for all 100 simulated datasets, all 30 variables and first iteration of the RFE-pseudosamples algorithm. The pseudo-samples distribution for each variable is shown with a non-parametric local regression estimation (LOESS) with the corresponding 95% confidence interval. (PDF 61 kb)
Additional file 8: Visualization of RFE-pseudo-samples results for Scenario 5. Scenario 5 (being Variable 1, 20 and 30 the relevant variables) results for all 100 simulated datasets, all 30 variables and first iteration of the RFE-pseudo-samples algorithm. The pseudosamples distribution for each variable is shown with a non-parametric local regression estimation (LOESS) with the corresponding 95% confidence interval. (PDF 53 kb)
Additional file 9: Visualization of RFE-pseudo-samples results for Scenario 6. Scenario 6 (being Variable 1 and 30 the relevant variables) results for all 100 simulated datasets, all 30 variables and first iteration of
the RFE-pseudo-samples algorithm. The pseudo-samples distribution for each variable is shown with a non-parametric local regression estimation (LOESS) with the corresponding 95% confidence interval. (PDF 60 kb)
Additional file 10: Results for PBC dataset comparing the four RFE algorithms and the Cox model. (PDF 6 kb)
Additional file 11: Results for DLBCL dataset comparing the four RFE algorithms and the Cox model. (PDF 5 kb)
Additional file 12: Results for Lung dataset comparing the four RFE algorithms and the Cox model. (PDF 5 kb)
Abbreviations KPCA: Kernel principal component analysis; RFE: Recursive Feature Elimination; SVM: Support vector machines
Acknowledgements Not applicable.
Funding This work was funded by Grant MTM2015-64465-C2-1-R (MINECO/FEDER) from the Ministerio de Economía y Competitividad (Spain) to JMO and EV. The funding did not play any role in the design of the study, the collection, analysis and interpretation of data, or the writing of the manuscript.
Availability of data and materials Simulated datasets during the current study are available from the corresponding author on reasonable request. PBC and Lung datasets are freely available at https://CRAN.R-project.org/ package=survival. DLBCL dataset is freely available at https://CRAN.R-project.org/package=ipred.
Authors contributions HS designed the study and carried out all programming work. FR supervised and provided input on all aspects of the study. CV provided helpful information from the design of the study perspective. FR, EV and JMO contributed algorithms for kernel methods. HS and FR discussed the results and wrote the manuscript. All authors have read and approved the final manuscript.
Ethics approval and consent to participate Not applicable.
Consent for publication Not applicable.
Competing interests The authors declare that they have no competing interests.
Publisher's Note
Springer Nature remains neutral with regard to jurisdictional claims in published maps and institutional affiliations.
Author details 1Department of Genetics, Microbiology and Statistics, Faculty of Biology, Universitat de Barcelona, Diagonal, 643, 08028 Barcelona, Catalonia, Spain. 2Department of Osteopathic Medical Specialties, Michigan State University, 909 Fee Road, Room B 309 West Fee Hall, East Lansing, MI 48824, USA. 3Department of Immunology and Infectious Diseases, Harvard T.H. Chan School of Public Health, 675 Huntington Ave, Boston, MA 02115, USA. 4Centre for Genomic Regulation (CRG), The Barcelona Institute for Science and Technology, Dr. Aiguader 88, 08003 Barcelona, Spain.
Received: 7 May 2018 Accepted: 30 October 2018
References
1. Guyon I, Weston J, Barnhill S, Vapnik V. Gene selection for cancer classification using support vector machines. Mach Learn. 2002;46:389–422.
2. Chen Y-W, Lin C-J. Combining SVMs with various feature selection strategies. In: Feature extraction. Berlin, Heidelberg: Springer; 2006. p. 315–24.
3. Maldonado S, Weber R. A wrapper method for feature selection using support vector machines. Inf Sci. 2009;179:2208–17.
4. Aytug H. Feature selection for support vector machines using generalized benders decomposition. Eur J Oper Res. 2015;244:210–8.
5. Weston J, Mukherjee S, Chapelle O, Pontil M, Poggio T, Vapnik V. Feature selection for SVMs. In: Proceedings of the 13th International Conference on Neural Information Processing Systems. Cambridge: MIT Press; 2000. vol. 13, p. 647–53.
6. Benders JF. Partitioning procedures for solving mixed-variables programming problems. Numer Math. 1962;4:238–52.
7. Becker N, Werft W, Toedt G, Lichter P, Benner A. penalizedSVM: a R-package for feature selection SVM classification. Bioinformatics. 2009;25:1711–2.
8. Becker N, Toedt G, Lichter P, Benner A. Elastic SCAD as a novel penalization method for SVM classification tasks in high-dimensional data. BMC Bioinformatics. 2011;12(1):138.
9. Saeys Y, Inza I, Larrañaga P. A review of feature selection techniques in bioinformatics. Bioinformatics. 2007;23:2507–17.
10. Liu Q, Chen C, Zhang Y, Hu Z. Feature selection for support vector machines with RBF kernel. Artif Intell Rev. 2011;36:99–115.
11. Alonso-Atienza F, Rojo-Álvarez JL, Rosado-Muñoz A, Vinagre JJ, García-Alberola A, Camps-Valls G. Feature selection using support vector machines and bootstrap methods for ventricular fibrillation detection. Expert Syst Appl. 2012;39:1956–67.
12. Krooshof PWT, Üstün B, Postma GJ, Buydens LMC. Visualization and recovery of the (bio)chemical interesting variables in data analysis with support vector machine classification. Anal Chem. 2010;82:7000–7.
13. Postma GJ, Krooshof PWT, Buydens LMC. Opening the kernel of kernel partial least squares and support vector machines. Anal Chim Acta. 2011;705:123–34.
14. Ruppert D. Statistics and data analysis for financial engineering. New York: Springer; 2011.
15. Leys C, Ley C, Klein O, Bernard P, Licata L. Detecting outliers: do not use standard deviation around the mean, use absolute deviation around the median. J Exp Soc Psychol. 2013;49:764–6.
16. Reverter F, Vegas E, Oller JM. Kernel-PCA data integration with enhanced interpretability. BMC Syst Biol. 2014;8(2):S6.
17. Scholkopf B, Smola AJ. Learning with kernels: support vector machines, regularization, optimization. Cambridge: MIT Press; 2001.
18. Scholkopf B, Mika S, Burges CJC, Knirsch P, Muller K-R, Ratsch G, Smola AJ. Input space versus feature space in kernel-based methods. IEEE Trans Neural Netw. 1999;10:1000–17.
19. Bender R, Augustin T, Blettner M. Generating survival times to simulate Cox proportional hazards models. Stat Med. 2005;24:1713–23.
20. Shiao H-T, Cherkassky V. SVM-based approaches for predictive modeling of survival data. In: Proceedings of the International Conference on Data Mining (DMIN); 2013. p. 1.
21. Niaf E, Flamary R, Lartizien C, Canu S. Handling uncertainties in SVM classification. In: Statistical Signal Processing Workshop (SSP); 2011. p. 757–60.

View File

@@ -0,0 +1,16 @@
Title: SVM-RFE: selection and visualization of the most relevant features through non-linear kernels
Subject: BMC Bioinformatics, 2018, doi:10.1186/s12859-018-2451-4
Keywords: Support vector machines,Relevant variables,Recursive feature elimination,Kernel methods
Author: Hector Sanz
Creator: Arbortext Advanced Print Publisher 9.1.440/W Unicode
Producer: Acrobat Distiller 10.1.5 (Windows); modified using iText® 5.3.5 ©2000-2012 1T3XT BVBA (AGPL-version)
CreationDate: 11/16/18 14:17:20
ModDate: 11/19/18 15:38:13
Tagged: no
Form: none
Pages: 18
Encrypted: no
Page size: 595.276 x 790.866 pts (rotated 0 degrees)
File size: 2917177 bytes
Optimized: no
PDF version: 1.4

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,222 @@
Testing for statistically significant differences between groups of scan patterns
Matt Feusner University of California, San Francisco
feusnerm@vision.ucsf.edu
Brian Lukoff Stanford University brian.lukoff@stanford.edu
Abstract
Pairwise sequence alignment methods are now often used when analyzing eyetracking data [Hacisalihzade et al. 1992; Brandt and Stark 1997; Josephson and Holmes 2002, 2006; Pan et al. 2004; Heminghous and Duchowski 2006]. While optimal sequence alignment scores provide a valuation of similarity and difference, they do not readily provide a statistical test of similarity or difference. Furthermore, pairwise alignment scores cannot be used to compare groups of scan patterns directly. Using a statistic that compiles these pairwise alignment scores, a statistical evaluation of similarity can be made by repeatedly computing scores from different permutations of scan pattern groupings. This test produces a p-value as a level of statistical significance.
Keywords
Eye tracking, analysis, similarity test, comparison, sequence comparison, scanpath, scan pattern, statistics
1. Introduction
Note that instead of providing a numerical outcome measure for an individual scan pattern, sequence alignment quantifies the dissimilarity between a pair of scan patterns. Thus, one cannot use traditional statistical methods for comparing two groups (e.g., a t-test or Wilcoxon signed-rank test), as there is no direct numerical measure of an individual scan pattern.
One way to compare groups of scan patterns is to use sequence alignment distances with multidimensional scaling (which reduces a matrix of similarity scores to a small number of dimensions) and attempt to cluster the scan patterns. However, statistical analysis has not been done before to compare clusters of sequences in eyetracking data [Josephson and Holmes 2002]. A multiple sequence alignment method can be used to consolidate groups of scanpaths into an average pattern, or consensus alignment [Hembrooke et al. 2006; West et al. 2006] and the pairwise distance can be used to compare the two representative sequences. However, without a distribution or statistical framework, there is no way to test for significance.
In an experiment where subjects are randomly assigned to two groups (e.g., a treatment and a control group), researchers typically want to compare the two groups on an outcome measure to see if the two groups performed differently. Often, researchers will select a numerical outcome measure and then perform a statistical test, such as a t-test, to determine whether the observed differences between the groups are due purely to chance.
In this paper, we present a straightforward adaptation to eyetracking research of a statistical procedure that utilizes a pairwise difference measure (sequence alignment) to compare two experimental groups and produce a p-value for significance. The procedure has been applied in other disciplines with other pairwise distance functions [Mantel 1967; Aittokallio et al. 2000; Kropf et al. 2006].
In eyetracking research, measured results generally consist of eye position and pupil size traces over time. Fixations and saccades are often extracted to use outcome measures such as fixation location, fixation duration, and saccade amplitude [Salvucci and Goldberg 2000]. However, those measures tear apart spatiotemporal data that is inherently linked; the natural outcome measure is a scanpath, or scan pattern, consisting of a series of fixations and saccades in both space and time. Sequence alignment algorithms, or optimal matching algorithms, are an excellent tool for analyzing this complex datatype [Salvucci and Anderson 2003], and have been successfully used in many eyetracking studies [Josephson and Holmes 2002, 2006; Pan et al. 2004; Myers 2005; West et al. 2006]. Sequence alignment works by computing the minimum number or magnitude of edit operations needed to transform one sequence into the other. Edit operations usually include insertion, deletion, and substitution [Josephson and Holmes 2002]. Given two scan patterns, sequence alignment produces as its output a distance value, or dissimilarity, between the two scan patterns.
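As a sketch of such a distance, the following Python function computes a simple global-alignment (edit) distance between two fixation sequences, with the Euclidean distance between fixations as the substitution cost and a constant gap penalty for insertions and deletions; the function and parameter names are illustrative, not taken from the cited studies.

```python
import numpy as np

def scanpath_distance(a, b, gap):
    """Minimal global-alignment distance between two fixation sequences.
    a, b: numpy arrays of shape (len, 2) with fixation (x, y) coordinates.
    Substitution cost is the Euclidean distance between fixations; insertion
    and deletion each cost the constant gap penalty."""
    la, lb = len(a), len(b)
    dp = np.zeros((la + 1, lb + 1))
    dp[:, 0] = np.arange(la + 1) * gap
    dp[0, :] = np.arange(lb + 1) * gap
    for i in range(1, la + 1):
        for j in range(1, lb + 1):
            sub = np.linalg.norm(a[i - 1] - b[j - 1])
            dp[i, j] = min(dp[i - 1, j - 1] + sub,   # substitution (or match)
                           dp[i - 1, j] + gap,       # deletion
                           dp[i, j - 1] + gap)       # insertion
    return dp[la, lb]
```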
2. Procedure
Suppose that an experiment is set up where there are n subjects in
one group and m subjects in another. Given any arbitrary
grouping of all n + m subjects into two groups (not necessarily the
experimental grouping) of sizes n and m, we can calculate
d^{*} = d_{\mathrm{between}} - d_{\mathrm{within}}, \qquad (1)
where dbetween is the average distance between the scan patterns of
subjects in different groups, and dwithin is the average distance
between the scan patterns of subjects in the same group. For a
random grouping, we would expect d* to be close to 0 and equally
likely to be positive or negative, since there is no reason to expect
that subjects in the same group have scan patterns more or less
similar to each other than subjects in different groups. Thus, over
all of the possible
\binom{n+m}{m} = \frac{(n+m)!}{n!\,m!} \qquad (2)
groupings of all n + m subjects into two groups, the distribution of d* is symmetric about its mean of 0. A positive d* statistic indicates that scan patterns in different groups are on average farther apart than scan patterns in the same group. Informally, this would mean that the particular grouping of subjects leads to two groups that each cluster together.
Permutation tests are a type of nonparametric statistical method that allow the researcher to distinguish d* values that are the result of random noise (e.g., an outlier subject) from d* values that represent a true difference between groups. The null hypothesis in the test is that each grouping is equally likely; in other words, that the experimental grouping is essentially just a random grouping of subjects into the appropriately sized groups. As in all statistical tests, the p-value is the probability of obtaining a grouping with a d* value as high as the one observed in the experimental grouping if in fact the null hypothesis is true. Computing the p-value in a permutation test is simple: examine all possible groupings of subjects and compute a d* statistic for each; the p-value is the proportion of the groupings that result in a d* statistic at least as large as the one observed in the experimental grouping. p-values from a permutation test are interpreted in the same way one would interpret a p-value from any other statistical test; typically p-values that are 5% or less are considered to be “significant” and worthy of the conclusion that there is a real difference between the experimental groups.
Calculating the p-value in a permutation test can often require a prohibitively large number of groupings, even with relatively small data sets. For two experimental groups of size 15, there are over 150 million (Equation 2). A standard "Monte Carlo" strategy to overcome this computational barrier is to select a random subset of groupings of a more manageable size (e.g., 1,000 or 10,000). Although this makes the permutation test computationally tractable, the p-value we obtain will have some random error in it because we randomly selected only a subset of the total number of groupings to examine. By selecting a random subset of groupings, we ensure that the estimated p-values given by the Monte Carlo procedure will average out to the exact p-value in the long term. For practical purposes, one may, for example, simply select the largest computationally feasible random subset of groupings to examine, or create confidence intervals for the estimated p-values [Nettleton and Doerge 2000].
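A minimal Python sketch of the d* statistic and its Monte Carlo permutation p-value, assuming a precomputed symmetric matrix D of pairwise scanpath distances (for example, from the alignment distance sketched earlier); all names are illustrative.

```python
import numpy as np

def d_star(D, idx_a, idx_b):
    """d* = mean between-group distance - mean within-group distance,
    for a symmetric pairwise distance matrix D and two index lists."""
    between = D[np.ix_(idx_a, idx_b)].mean()
    wa = D[np.ix_(idx_a, idx_a)][np.triu_indices(len(idx_a), k=1)]
    wb = D[np.ix_(idx_b, idx_b)][np.triu_indices(len(idx_b), k=1)]
    within = np.concatenate([wa, wb]).mean()
    return between - within

def permutation_pvalue(D, n_a, n_mc=10000, rng=None):
    """Monte Carlo permutation test: proportion of random regroupings whose
    d* is at least as large as the observed one (subjects 0..n_a-1 = group A)."""
    rng = np.random.default_rng(rng)
    n = D.shape[0]
    observed = d_star(D, list(range(n_a)), list(range(n_a, n)))
    count = 0
    for _ in range(n_mc):
        perm = rng.permutation(n)
        if d_star(D, perm[:n_a].tolist(), perm[n_a:].tolist()) >= observed:
            count += 1
    return observed, count / n_mc
```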
3. Results
While the test can be used with any pairwise distance function, it is helpful first to explore the behavior of the test using a string edit distance to compare sets of synthetic data. Here the intent is to verify that the statistical test produces the results that are expected based on how the data were generated.
First, two groups (A and B) of artificial scan patterns were generated from the same process, a rectangular pattern with a small amount of random noise. Each of the 7 scans in each group had 4 fixations on each side of a square shape, moving clockwise from the top left corner (Figure 1). In order to induce both substitution and insertion/deletion edits, fixations were initially spaced 100 pixels apart and then adjusted using random noise variables in both the x and y directions that were uniformly distributed in [−100, 100]. The substitution parameter is the Euclidean distance between fixations, and the gap penalty parameter was 139.29 pixels, the length of the average saccade for both sets. The d* statistic computed was -21.47. Running all possible 3432 permutations (Equation 2), 3168 were less than or equal to d*, resulting in a p-value of 0.92308, which rightly fails to reject the null hypothesis that the sets are the same (Table 1).
Next, a new group was generated with the same rectangular processes, but moving in the opposite counterclockwise direction. This group was compared with group A from the previous test using a string edit gap parameter of 145.34, again the average saccade distance. The d* statistic was 405.23, and the p-value
Figure 1: An initial scanpath (dashed) and a final scanpath used in the data after random noise was added (solid). The dashed
square is 300 x 300 pixels. Arrows indicate direction of saccades.
was 2/3432 = 0.00058, correctly indicating a significant difference (Table 1).
Having verified the appropriate results for both similar and dissimilar data, it is interesting to see what type of data is at a borderline level of significance. The random noise is increased in 50 pixel steps for both the clockwise and counterclockwise scans, so that each fixation is moved from its point on the original square by a random variable that is uniformly distributed on [-r, r].
At r = 300 the p-value begins to lose significance (Table 2). This result makes sense since the side of the square is, on average, 300 pixels. As r is increased, the transition from positive to negative values of d* is also at r = 300 (Figure 2).
A more interesting test is to reanalyze the data of another study conducted with other methods of analysis. In a study by Dixon et al. [2006], subjects were eyetracked viewing movies under different viewing conditions: visible light, infrared (IR) light, and 3 experimental combinations of the two: a simple average (AVE), a complex wavelet transformation (CWT) and a discrete wavelet transformation (DWT). In the movie, subjects were asked to signal when a moving figure came to a certain location in a forest, and their accuracy tracking the figure was measured. Dixon et al. found that accuracy performance was significantly different between the visible light and combined conditions, and between the IR light condition and the DWT combined condition. (This
Table 1. Synthetic data: comparisons with data group A

Data group   Gap parameter   d*        p
B            139.29          -21.47    3168/3432 = 0.92308
Reversed     145.34          405.23    2/3432 = 0.00058
Table 2. Synthetic data: comparisons for changing random noise

r     Gap parameter   d*        p
50    108.68          530.16    1/3432 = 0.00029
100   140.22          415.22    2/3432 = 0.00058
150   179.91          196.83    2/3432 = 0.00058
200   232.87          338.85    1/3432 = 0.00029
250   361.73          146.77    46/3432 = 0.01340
300   437.52          189.54    20/3432 = 0.00583
350   652.02          -52.94    1985/3432 = 0.57838
400   761.24          -16.46    1444/3432 = 0.42075
450   1094.29         -164.05   2432/3432 = 0.70862
500   1203.45         -193.14   2990/3432 = 0.87121
dataset was made available online at http://www.cis.rit.edu/pelz/scanpaths/data/bristol-eden.htm.) To run the permutation test on this dataset, 30 scans (3 sessions by 10 subjects) were grouped together for each of the 5 conditions. The string edit distance was again parameterized by a Euclidean-distance substitution penalty and a gap penalty equal to the average saccade length. A total of 1000 Monte Carlo samples were taken in each test. The visible light condition was significantly different from all of the other conditions (IR: p = 0.011; AVE: p = 0.027; CWT: p = 0.014; DWT: p = 0.029). This confirms the accuracy result from the original study for the 3 combined-viewing conditions. However, the visible light scans also differed from the IR scans, even though accuracy in those two conditions was not significantly different. The permutation test therefore provides evidence that viewing patterns can differ significantly despite similar tracking accuracy. Comparisons of the IR condition scans to the scans in the remaining 3 combined conditions were not significant (AVE: p = 0.56; CWT: p = 0.384; DWT: p = 0.517). In the original study there was a significant difference in accuracy between the IR and DWT conditions, but that difference was not reiterated here. Hence there is also evidence that similar viewing patterns can still produce significantly different tracking accuracy. In agreement with the accuracy results, the permutation test found no significant differences among the combined viewing conditions (AVE and CWT: p = 0.16; AVE and DWT: p = 0.288; CWT and DWT: p = 0.523).

Figure 2: d_between and d_within (top) and d* (bottom) as a function of the noise parameter r. When d_within > d_between, d* is negative.
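For readers who want to reproduce this kind of analysis, the following is a minimal sketch of the permutation test as described above: d* is the mean between-group distance minus the mean within-group distance, and the p-value is the proportion of relabelings whose d* is at least as large as the observed one. The function names and the Monte Carlo sampling scheme are our own; any pairwise distance function can be plugged in.

```python
import itertools
import numpy as np

def d_star(dist, group_a, group_b):
    """d* = mean between-group distance minus mean within-group distance."""
    between = [dist(a, b) for a in group_a for b in group_b]
    within = ([dist(a, b) for a, b in itertools.combinations(group_a, 2)] +
              [dist(a, b) for a, b in itertools.combinations(group_b, 2)])
    return np.mean(between) - np.mean(within)

def permutation_test(dist, group_a, group_b, n_samples=1000, rng=None):
    """Monte Carlo permutation test on two groups of scanpaths.
    Returns the observed d* and the estimated p-value."""
    rng = np.random.default_rng() if rng is None else rng
    observed = d_star(dist, group_a, group_b)
    pooled = list(group_a) + list(group_b)
    n_a = len(group_a)
    exceed = 0
    for _ in range(n_samples):
        perm = rng.permutation(len(pooled))
        new_a = [pooled[i] for i in perm[:n_a]]
        new_b = [pooled[i] for i in perm[n_a:]]
        if d_star(dist, new_a, new_b) >= observed:
            exceed += 1
    return observed, exceed / n_samples
```

With small groups the relabelings can instead be enumerated exhaustively; the fixed denominators of 3432 in Tables 1 and 2 reflect exact enumeration of all distinct group splits for the synthetic data, whereas the Dixon et al. reanalysis above used 1000 Monte Carlo samples.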
4. Discussion
One important strength of the permutation test is that it is nonparametric; in other words, it does not assume that the distances take on any particular distribution. Consequently, any distance function can be used: one can easily change the parameters of the string edit distance (i.e., the specific penalties for insertion, deletion, and substitution) or even use another pairwise distance function altogether.
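As one concrete way to exercise that flexibility, the sketch below shows a global sequence-alignment distance in which the substitution cost and gap penalty are parameters; the Euclidean default mirrors the parameterization used in the analyses above, but the code itself is illustrative and not the implementation used in the study.

```python
import numpy as np

def euclidean_substitution(f1, f2):
    """Default substitution cost: Euclidean distance between fixation positions."""
    return float(np.hypot(f1[0] - f2[0], f1[1] - f2[1]))

def alignment_distance(path_a, path_b, gap, substitution=euclidean_substitution):
    """Needleman-Wunsch-style global alignment between two fixation sequences.
    Aligning fixation i with fixation j costs substitution(i, j); leaving a
    fixation unaligned (insertion or deletion) costs the constant gap penalty."""
    n, m = len(path_a), len(path_b)
    D = np.zeros((n + 1, m + 1))
    D[:, 0] = gap * np.arange(n + 1)
    D[0, :] = gap * np.arange(m + 1)
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            D[i, j] = min(D[i - 1, j - 1] + substitution(path_a[i - 1], path_b[j - 1]),
                          D[i - 1, j] + gap,   # delete a fixation from path_a
                          D[i, j - 1] + gap)   # insert a fixation from path_b
    return D[n, m]
```

A scanpath here is simply a sequence of (x, y) fixation positions; setting gap to the average saccade length of the dataset and passing a wrapper such as lambda a, b: alignment_distance(a, b, gap) to the permutation test sketched earlier reproduces the pipeline described above in outline only.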
For simplicity, the particular sequence alignment distance function we used in this study does not take pupil size or fixation duration into account when computing the distance. In other words, two scanpaths produced by subjects whose eyes follow the same spatial path will have a distance of 0, even if fixation duration and pupil size vary wildly. One could overcome this limitation by using substitution functions and gap penalties that include these extra measurements as weights or extra dimensions, linking many possible measurements to the statistical test.
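One way such extra measurements could enter the distance, sketched below, is through a weighted substitution cost over fixations represented as (x, y, duration, pupil size). The weights and the additive form are assumptions for illustration only.

```python
import numpy as np

def weighted_substitution(f1, f2, w_pos=1.0, w_dur=0.5, w_pupil=0.5):
    """Substitution cost between two fixations given as
    (x, y, duration_ms, pupil_diameter): positional distance combined with
    absolute differences in duration and pupil size.  The weights are
    illustrative and would need tuning for a real study."""
    x1, y1, d1, p1 = f1
    x2, y2, d2, p2 = f2
    pos = np.hypot(x1 - x2, y1 - y2)
    return w_pos * pos + w_dur * abs(d1 - d2) + w_pupil * abs(p1 - p2)
```

Because the alignment sketch above takes the substitution function as a parameter, a function like this can be dropped in without changing the rest of the test.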
Another modification to the sequence alignment parameters would be to incorporate smooth pursuit eye movements. Video images, such as those analyzed here, often induce smooth pursuit eye movements in addition to fixations and saccades [Dixon et al. 2007]. Like Dixon et al. [2006], we did not explicitly analyze smooth pursuit movements. However, one could use a substitution function that conditionally applies different metrics depending on the type of eye movement (fixation or smooth pursuit). Such a function could then be used in place of the simple Euclidean distance as a parameter for the sequence alignment algorithm.
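A conditional substitution function of the kind suggested here might look like the following sketch, where each sample carries a movement-type label; the mismatch penalty and the per-type metrics are placeholders, not values from any published analysis.

```python
import numpy as np

def conditional_substitution(s1, s2, mismatch_penalty=500.0):
    """Substitution cost that switches metrics by eye-movement type.
    Each sample is (x, y, kind) with kind in {"fixation", "pursuit"}."""
    x1, y1, k1 = s1
    x2, y2, k2 = s2
    if k1 != k2:
        return mismatch_penalty  # aligning unlike movement types
    d = np.hypot(x1 - x2, y1 - y2)
    # Illustrative choice: weight positional error less strictly for pursuit,
    # since pursuit samples trace a moving target rather than a fixed point.
    return d if k1 == "fixation" else 0.5 * d
```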
Depending on the researcher's substantive interests, other distance functions may be better suited to comparing individual scanpaths. For example, the area of the convex hull, or of a circle circumscribed around the scanpath, indicates how focused the eye movements in a single scanpath are [Goldberg and Kotval 1999]. While these measures necessarily oversimplify the representation of a scanpath by quantifying only the extent of focus, ignoring other features (e.g., fixation density or timing), they do assign a single number to each scanpath. Since a pairwise distance function can be built from any such computed value by taking arithmetic differences, the permutation test described here can also be conducted using these distances. The choice of distance function should be theoretically motivated; one should choose a distance function that reflects whatever substantive differences between scanpaths the researcher is interested in.
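For instance, a scalar summary such as the convex hull area yields a pairwise distance by taking absolute differences, as in the sketch below (our own illustration, assuming scipy is available).

```python
import numpy as np
from scipy.spatial import ConvexHull

def hull_area(scanpath):
    """Area of the convex hull around a scanpath's fixation points.
    For 2-D input, ConvexHull.volume is the enclosed area; the scanpath
    needs at least three non-collinear fixations."""
    return ConvexHull(np.asarray(scanpath, dtype=float)).volume

def hull_area_distance(path_a, path_b):
    """Pairwise distance built from the scalar summary by arithmetic difference."""
    return abs(hull_area(path_a) - hull_area(path_b))
```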
Our reanalysis of the data in the 2006 study by Dixon et al. echoes only some of the results found by the study's researchers. One reason may be that examining entire scanpaths provides a different picture of the overall results than examining a single numerical measure summarizing tracking accuracy. However, another reason for the difference may be the statistical differences between the analysis described here and the analyses described by Dixon et al. First, our analysis does not adjust for the fact that each group consisted of the same ten subjects producing repeated scanpaths, and therefore ignores the likely similarity between scanpaths produced by the same
subject. While Dixon et al. correct for this by using a repeated measures ANOVA, the permutation test does not take this interdependence into account. Second, the p-values we report above are uncorrected for multiple comparisons (Dixon et al. use Tukey's HSD). Future work should refine the permutation test to account for these complications in the study design.
There are two important practical issues to consider before using the permutation test. First, researchers must determine the specific distance function used to quantify the difference between two scanpaths, and the parameters of that function (e.g., the substitution, insertion, and deletion parameters of the string edit distance). Second, researchers must confront the more general statistical issues of any Monte Carlo permutation test, particularly how many permutations to sample (1000, 10000, or more?) given time constraints and the desired power of the test.
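On the second point, a rough guide (our own back-of-the-envelope reasoning, not from the study) is that the Monte Carlo p-value is a binomial proportion, so its standard error shrinks with the square root of the number of sampled permutations:

```python
import math

def mc_p_standard_error(p, n_samples):
    """Approximate standard error of a Monte Carlo p-value estimated from
    n_samples random permutations (binomial approximation)."""
    return math.sqrt(p * (1 - p) / n_samples)

# For a true p near 0.05, 1000 samples give a standard error of roughly 0.007,
# while 10000 samples tighten it to roughly 0.002.
print(mc_p_standard_error(0.05, 1000), mc_p_standard_error(0.05, 10000))
```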
5. Conclusion
When a researcher conducts an experiment where the outcomes are scanpaths, it is important to be able to determine statistically whether the observed differences between the scanpaths in each experimental group are due to real differences between the groups or simply due to random variation. Methods that rely on human judgment to determine whether there is a real difference between the two groups are susceptible to bias from researcher expectation about what the result should be, so it is important to have a statistical decision method. The test presented here is applicable whenever a researcher conducts an experiment where the outcome is a scanpath, and is flexible enough to accommodate any pairwise distance function. The optimal sequence alignment algorithm is shown to be a reasonable choice for computing pairwise distances. Even in studies where the scanpath is not the primary outcome of interest, the permutation test can still yield useful and interesting results because it can illuminate cases when there are differences between scanpaths that are not detected by the outcome measure targeted by the researchers.
Acknowledgements
We thank Laura Granka, Timothy Dixon, and John Economides for their comments and help proofreading this work.
References
AITTOKALLIO, T., OJALA, P., NEVALAINEN, T. J., and NEVALAINEN, O. 2000. Analysis of similarity of electrophoretic patterns in mRNA differential display. Electrophoresis, 21, 2947-56.
BRANDT, S. A., and STARK, L. W. 1997. Spontaneous eye movements during visual imagery reflect the content of the visual scene. Journal of Cognitive Neuroscience, 9, 27-38.
DIXON, T. D., LI, J., NOYES, J. M., TROSCIANKO, T., NIKOLOV, S. G., LEWIS, J., CANGA, E. F., BULL, D. R., and CANAGARAJAH, C. N. 2006. Scanpath analysis of fused multi-sensor images with luminance change: a pilot study. 9th International Conference on Information Fusion (ICIF '06), 1-8.
DIXON, T. D., NIKOLOV, S. G., LEWIS, J. J., LI, J., CANGA, E. F., NOYES, J. M., TROSCIANKO, T., BULL, D. R., and CANAGARAJAH, C. N. 2007. Assessment of fused videos using scanpaths: a comparison of data analysis methods. Spatial Vision, 20, 437-466.
GOLDBERG, J. H., and KOTVAL, X. P. 1999. Computer interface evaluation using eye movements: methods and constructs. International Journal of Industrial Ergonomics, 24, 631-645.
HACISALIHZADE, S., STARK, L., and ALLEN, J. 1992. Visual perception and sequences of eye movement fixations: a stochastic modeling approach. IEEE Transactions on Systems, Man and Cybernetics, 22, 474-481.
HEMBROOKE, H., FEUSNER, M., and GAY, G. 2006. Averaging scan patterns and what they can tell us. Proceedings of the 2006 symposium on Eye Tracking Research & Applications, 41.
HEMINGHOUS, J. and DUCHOWSKI, A. D. 2006. iComp: a tool for scanpath visualization and comparison. Proceedings of the 3rd symposium on Applied Perception in Graphics and Visualization (APGV '06), 152.
JOSEPHSON, S. and HOLMES, M. E. 2002. Visual attention to repeated internet images: testing the scanpath theory on the world wide web. Proceedings of the 2002 symposium on Eye Tracking Research & Applications, 43-49.
JOSEPHSON, S. and HOLMES, M. E. 2006. Clutter or content? How on-screen enhancements affect how TV viewers scan and what they learn. Proceedings of the 2006 symposium on Eye Tracking Research & Applications, 155-162.
KROPF, S., LUX, A., ESZLINGER, M., HEUER, H., and SMALLA, K. 2006. Comparison of independent samples of high-dimensional data by pairwise distance measures. Biometrical Journal, 48, 1-12.
MANTEL, N. 1967. The detection of disease clustering and a generalized regression approach. Cancer Research, 27, 209-220.
MYERS, C. W. 2005. Toward a method of objectively determining scanpath similarity. Journal of Vision, 5, 693.
NETTLETON, D. and DOERGE, R. W. 2000. Accounting for variability in the use of permutation testing to detect quantitative trait loci. Biometrics, 56, 52-58.
PAN, B., HEMBROOKE, H. A., GAY, G. K., GRANKA, L. A., FEUSNER, M. K., and NEWMAN, J. K. 2004. The determinants of web page viewing behavior: an eye-tracking study. Proceedings of the 2004 symposium on Eye Tracking Research & Applications, 147-154.
SALVUCCI, D. D. and ANDERSON, J. R. 2001. Automated eye-movement protocol analysis. Human-Computer Interaction, 6, 39-86.
SALVUCCI, D. D. and GOLDBERG, J. H. 2000. Identifying fixations and saccades in eye-tracking protocols. Proceedings of the 2000 symposium on Eye Tracking Research & Applications, 71-78.
WEST, J. M., HAAKE, A. R., ROZANSKI, E. P., and KARN, K. S. 2006. eyePatterns: software for identifying patterns and similarities across fixation sequences. Proceedings of the 2006 symposium on Eye Tracking Research & Applications, 149-154.