@InProceedings{renear07:FRBR,
  author    = {Renear, Allen H. and Dubin, David},
  title     = {Three of the Four {FRBR} Group 1 Entity
               Types are Roles, not Types},
  booktitle = {Proceedings of the 70th Annual Meeting of the American Society for Information Science and Technology},
  editor    = {Grove, Andrew},
  publisher = {Information Today, Inc.},
  address   = {Medford, NJ},
  year      = 2007,
  url       = {http://hdl.handle.net/2142/9094},
  abstract  = {We examine the conceptual model of the "bibliographic
              universe" presented in IFLA's Functional Requirements
              for Bibliographic Records (FRBR) and argue, applying
              ontology design recommendations proposed by N. Guarino
              and C. Welty, that three of the four Group 1 entity
              types are more accurately conceptualized as roles. We
              show how this approach may generalize the solution to a
              previously identified puzzle regarding the FRBR entity
              type of XML documents and speculate as to the sorts of
              entities that might take on these roles. This view of
              bibliographic entities, that they are roles that other
              things have in particular social contexts is consistent
              with John Searle's notion of a cascade of social facts
              established through collective intentionality. We allow
              that even if our analysis is correct the current FRBR
              approach may be preferable as there are good reasons for
              "denormalized ontologies" that treat roles as types,
              particularly when the objective is not a general
              ontology, but a practical conceptual model.}
}

@InProceedings{dubin06:extreme,
  author    = {David Dubin and Joe Futrelle and Joel Plutchak},
  title     = {Metadata Enrichment for Digital Preservation},
  booktitle = {Proceedings of Extreme Markup Languages 2006},
  year      = 2006,
  editor    = {Usdin, B. T.},
  address   = {Montreal, Quebec},
  month     = aug,
  url       = {http://hdl.handle.net/2142/9461},
  abstract  = {Description of structural and semantic relationships and
  properties of, within, and between resources is seen as a key issue
  in digital preservation. But the markup languages used to encode
  descriptions for migration between and storage within digital
  repositories are subject to the same interpretive problems that
  complicate other uses of markup. This paper reports on a project
  that aims to address these problems by explicating facts that
  otherwise would not support automated inferencing. These facts are
  expressed as RDF (Resource Description Framework) triples, stored in
  and retrieved from a scalable RDF-based repository.}
}

@InProceedings{dubin04:extreme,
  author    = {David Dubin and David Birnbaum},
  title     = {Interpretation beyond markup},
  booktitle = {Proceedings of Extreme Markup Languages 2004},
  year      = 2004,
  editor    = {Usdin, B. T.},
  address   = {Montreal, Quebec},
  month     = aug,
  url       = {http://hdl.handle.net/2142/11838},
  abstract  = {The meaning conveyed by documents and their markup often goes
        well beyond what can be inferred from the markup alone. It
        often depends on context, so that to interpret document markup
        adequately we must situate markup interpretation within a
        broader interpretive problem. Markup is just one of several
        sources of evidence brought to bear in processing digital
        documents; it must be integrated with other information to be
        exploited fully. An example drawn from an ongoing project on
        the metrical analysis of Russian poetry helps illustrate these
        points: the explicit markup of the rhyme scheme can be
        understood only on the basis of a broader understanding of
        Russian meter and poetics.}
}

@InProceedings{dubin03:extreme,
  author    = {David Dubin},
  title     = {Object mapping for markup semantics},
  booktitle = {Proceedings of Extreme Markup Languages 2003},
  year      = 2003,
  editor    = {Usdin, B. T.},
  address   = {Montreal, Quebec},
  month     = aug,
  url       = {http://hdl.handle.net/2142/11842},
  abstract  = {The BECHAMEL system is a knowledge representation and
       inference environment for expressing and testing semantic rules
       and constraints for markup languages. Written in Prolog, the
       system provides predicates for processing the syntactic
       structures that emerge from a SGML/XML parser, defining object
       classes, instantiating object instances, assigning values to
       properties, and establishing relationships between or among
       object instances.  BECHAMEL uses Prolog's built-in capabilities
       to derive inferences from these facts.

       Part of the ongoing development of BECHAMEL involves
       experimenting with strategies for mapping syntactic relations
       to object relations and properties. This paper describes the
       current strategy, based on a blackboard model. Advantages of
       this approach include context free rules and the potential to
       exploit parallel processing for scalability. It has the
       drawback, however, of not permitting evidence to be described
       in ways people are likely to find natural or familiar. By using
       the current approach to produce formal accounts of the
       semantics of popular markup languages, we hope to learn a great
       deal about the ways markup syntax typically cues semantic
       relationships. That advance in our understanding will inform
       the development of more usable languages for object mapping.}
}



@InProceedings{renear03:frbr,
  author    = {Allen Renear and Christopher Phillippe and Pat Lawton
               and David Dubin},
  title     = {An {XML} document corresponds to which {FRBR}
               {Group 1} entity?},
  booktitle = {Proceedings of Extreme Markup Languages 2003},
  year      = 2003,
  editor    = {Usdin, B. T.},
  address   = {Montreal, Quebec},
  month     = aug,
  url       = {http://hdl.handle.net/2142/11885},
  abstract  = {The FRBR (Functional Requirements for Bibliographic
      Records), released by the International Federation of Library
      Associations and Institutions in 1998, generalizes and refines
      current practices and theory in library cataloging, presenting a
      compelling natural ontology of entities, attributes, and
      relationships for representing the "bibliographic universe." The
      FRBR framework is extremely influential and increasingly
      accepted as a conceptual foundation for cataloging practice and
      technology in libraries and elsewhere. XML documents as defined
      in the W3C XML 1.0 specification, are now an important part of
      this bibliographic universe and it is natural to ask to which of
      FRBR's "Group 1" entities does the XML document
      correspond. Curiously, there seem to be conflicting arguments
      for assigning the XML document to either of the two plausible
      entity categories: manifestation and expression. We believe
      these difficulties illuminate both the nature of the FRBR
      entities, and the nature of markup. We explore a conjecture that
      an XML document has a double aspect and that whether it is a
      FRBR manifestation or a FRBR expression depends upon context and
      intention. Such a double-aspected nature would not only be
      consistent with previous arguments that the meaning of XML
      markup varies in "illocutionary force" according to context of
      use, but might also help resolve an old puzzle in the humanities
      computing community as to whether markup is "part of" the text
      (Buzzetti 2002). However, there are alternative resolutions to
      explore as well and we seem to still be some distance from a
      full understanding of the issues.}
}

@InProceedings{renear03:DC,
  author    = {Allen Renear and David Dubin},
  title     = {Towards Identity Conditions for Digital Documents},
  booktitle = {Proceedings of the 2003 Dublin Core Conference},
  year      = 2003,
  editor    = {S. Sutton},
  address   = {Seattle, WA},
  month     = oct,
  publisher = {University of Washington},
  url       = {http://www.siderean.com/dc2003/503_Paper71.pdf},
  abstract  = {By "identity conditions" we mean a method for
   determining whether an object x and an object y are the same
   object. Identity conditions are arguably an essential feature of
   any rigorously developed conceptual framework for information
   modeling. Surprisingly, the concept of same document, which is
   fundamental to many aspects of library and information science, and
   to digital libraries in particular, has received little systematic
   analysis. As a result, not only is the concept of a document itself
   under-theorized, but progress on a number of important practical
   problems has been hindered. We review the importance of document
   identity conditions, demonstrate problems with current approaches,
   and discuss the general form a solution must take.}
}

@InProceedings{renear08:DC,
  author       = {Renear, Allen H. and Wickett, Karen M. and Urban, Richard J.
                  and Dubin, David and Shreeves, Sarah L.},
  title        = {Collection/Item Metadata Relationships},
  booktitle    = {Proceedings of the International Conference on
                  {Dublin} {Core} and Metadata Applications, Berlin},
  pages        = {80--89},
  year         = 2008,
  editor       = {Jane Greenberg and Wolfgang Klas},
  address      = {Goettingen},
  month        = sep,
  organization = {Dublin Core Metadata Initiative},
  publisher    = {Goettingen University Press},
  abstract     = {Contemporary retrieval systems, which search across
              collections, usually ignore collection-level metadata.
              Alternative approaches, exploiting collection-level
              information, will require an understanding of the
              various kinds of relationships that can obtain between
              collection-level and item-level metadata. This paper
              outlines the problem and describes a project that is
              developing a logic-based framework for classifying
              collection/item metadata relationships. This framework
              will support (i) metadata specification developers
              defining metadata elements, (ii) metadata creators
              describing objects, and (iii) system designers
              implementing systems that take advantage of
              collection-level metadata. We present three examples of
              collection/item metadata relationship categories,
              attribute/value-propagation, value-propagation, and
              value-constraint and show that even in these simple
              cases a precise formulation requires modal notions in
              addition to first-order logic. These formulations are
              related to recent work in information retrieval and
              ontology evaluation.},
  url          = {http://hdl.handle.net/2142/9144}
}

@Article{dubin03:llc,
  author   = {David Dubin and C. M. Sperberg-McQueen and Allen Renear
              and Claus Huitfeldt},
  title    = {A Logic Programming Environment for Document
              Semantics and Inference},
  journal  = {Literary and Linguistic Computing},
  year     = 2003,
  volume   = 18,
  number   = 2,
  pages    = {225--233},
  url      = {http://llc.oxfordjournals.org/cgi/reprint/18/2/225},
  note     = {(This is a corrected version of an article that appeared in 18:1 pp. 39--47)},
  abstract = {Markup licenses inferences about a text. But the
  information warranting such inferences may not be entirely explicit
  in the syntax of the markup language used to encode the text. This
  paper describes a Prolog environment for exploring alternative
  approaches to representing facts and rules of inference about
  structured documents. It builds on earlier work proposing an account
  of how markup licenses inferences, and of what is needed in a
  specification of the meaning of a markup language. Our system
  permits an analyst to specify facts and rules of inference about
  domain entities and properties as well as facts about the markup
  syntax, and to construct and test alternative approaches to
  translation between representation layers. The system provides a
  level of abstraction at which the performative or interpretive
  meaning of the markup can be explicitly represented in
  machine-readable and executable form.}
}

@InProceedings{lee03:lisa4,
  author    = {Jonghoon Lee and David Dubin},
  title     = {Vocabulary Mapping in the {NASA} {ADS}: Prospects for
               Practical Subject Access},
  booktitle = {Library and Information Services in Astronomy {IV}},
  pages     = {249--256},
  year      = 2003,
  editor    = {B. Corbin and E. Bryson and M. Wolf},
  address   = {Washington, DC},
  publisher = {U.S. Naval Observatory},
  url       = {http://www.eso.org/sci/libraries/lisa4/Dubin1.pdf},
  abstract  = {
  The popular NASA Astrophysics Data System includes bibliographic
  records indexed with terms from a variety of semi-compatible
  descriptor languages. These include coordinate index terms taken
  from the NASA Thesaurus and Astrophysical Journal subject headings,
  among others. We have worked to develop a system that takes as input
  the NASA terms assigned by professional indexers, and translates
  them into ApJ headings. Our system maps sets of descriptors, rather
  than individual descriptors, since two or more coordinate index
  terms may translate to a single pre-coordinated subject heading. We
  began our study with lexical resemblance as the main source of
  evidence, and later developed a connectionist system that exploits
  patterns of consistent co-assignment in a subset of the ADS
  collection that is indexed using both ApJ headings and NASA
  terms. Our most recent efforts have been aimed at improving the
  network's performance via supervised learning. In this paper we
  present the results of our most recent formal evaluation studies,
  and an examination of some specific documents drawn from a set we've
  mapped using the network.}
}

@InProceedings{renear03:jcdl,
  author    = {Allen Renear and David Dubin and C. M. Sperberg-McQueen
               and Claus Huitfeldt},
  title     = {{XML} semantics and digital libraries},
  booktitle = {Proceedings of the third {ACM}/{IEEE-CS} joint conference
               on Digital libraries},
  pages     = {303--305},
  year      = 2003,
  editor    = {C. C. Marshall and G. Henry and L. Delcambre},
  address   = {Los Alamitos, CA},
  publisher = {IEEE},
  url       = {http://portal.acm.org/citation.cfm?id=827192},
  abstract  = {The lack of a standard formalism for expressing the
  semantics of an XML vocabulary is a major obstacle to the
  development of high-function interoperable digital libraries. XML
  document type definitions (DTDs) provide a mechanism for specifying
  the syntax of an XML vocabulary, but there is no comparable
  mechanism for specifying the semantics of that vocabulary --- where
  semantics simply means the basic facts and relationships represented
  by the occurrence of XML constructs. A substantial loss of
  functionality and interoperability in digital libraries results from
  not having a common machine-readable formalism for expressing these
  relationships for the XML vocabularies currently being used to
  encode content. Recently a number of projects and standards have
  begun taking up related topics. We describe the problem and our own
  project. }
}

@InProceedings{cmsmcq02:extreme,
  author    = {C. M. Sperberg-McQueen and David Dubin and
               Claus Huitfeldt and Allen Renear},
  title     = {Drawing inferences on the basis of markup},
  booktitle = {Proceedings of Extreme Markup Languages 2002},
  year      = 2002,
  editor    = {Usdin, B. T. and Newcomb, S. R.},
  address   = {Montreal, Quebec},
  month     = aug,
  url       = {http://hdl.handle.net/2142/11844},
  abstract  = {
         Various authors have sketched out proposals for identifying
         the meaning, or guiding the automated interpretation, of
         markup, sometimes with the goal of using the information
         expressed by markup to guide the extraction of information
         from documents and using it to populate reasoning engines. We
         describe one approach to the problems of building a system to
         perform such a task.
}
}

@InProceedings{renear02:doceng,
  author    = {Allen Renear and David Dubin and C. M. Sperberg-McQueen
               and Claus Huitfeldt},
  title     = {Towards a Semantics for {XML} Markup},
  booktitle = {Proceedings of the 2002 {ACM} Symposium on Document
               Engineering},
  pages     = {119--126},
  year      = 2002,
  editor    = {R. Furuta and J. I. Maletic and E. Munson},
  address   = {McLean, VA},
  month     = nov,
  publisher = {Association for Computing Machinery},
  url       = {http://doi.acm.org/10.1145/585058.585081},
  abstract  = {

Although structured document standards provide mechanisms for
specifying, in machine-readable form, the syntax of a markup language,
there is no comparable mechanism for specifying the <em>semantics</em>
of an SGML or XML vocabulary. That is, there is no way to characterize
the meaning of markup so that the facts and relationships represented
by the occurrence of its constructs can be explicitly,
comprehensively, and mechanically identified. This has serious
practical and theoretical consequences. On the positive side, SGML/XML
constructs can be assigned arbitrary semantics and used in application
areas not foreseen by the original designers.  On the less positive
side, both content developers and application engineers must rely upon
prose documentation, or, worse, conjectures about the intention of the
markup language designer --- a process that is time-consuming,
error-prone, incomplete, and unverifiable, even when the language
designer properly documents the language. In addition, the lack of a
substantial body of research in markup semantics means that digital
document processing is undertheorized as an engineering application
area. Although there are some related projects underway (XML Schema,
RDF, the Semantic Web) which provide relevant results, none of these
projects directly and comprehensively address the core problems of
structured document semantics. This proposal characterizes the
specific problems that motivate the need for a formal semantics for
SGML/XML, describes an ongoing research project --- the BECHAMEL
Markup Semantics Project --- that is attempting to develop such a
semantics.

}
}


@InProceedings{lee99:sigir,
  author       = {Lee, Jonghoon and Dubin, David},
  title        = {Context-Sensitive Vocabulary Mapping with a Spreading
                  Activation Network},
  booktitle    = {Proceedings of the 1999 {ACM} {SIGIR} Conference on Research
                  and Development in Information Retrieval},
  editor       = {Hearst, M. and Gey, F. and Tong, R.},
  year         = 1999,
  organization = {Association for Computing Machinery},
  publisher    = {ACM},
  address      = {New York},
  pages        = {198--205},
  abstract     = {A spreading activation network model is applied to the
              problem of reconciling heterogeneous indexing ({STI}
              index terms and {ApJ} subject descriptors) in a database
              of documents in the field of astronomy. Drawing on
              evidence from a set of co-indexed documents, a 3-layer
              feed-forward network is constructed. It includes an
              input term layer (source vocabulary), document layer,
              and output term layer (target vocabulary). Results of
              experiments show that the network can uncover both
              static, term-to-term relationships, and those that
              depend on the context of a particular document's
              indexing. From the static mapping experiment, the
              asymmetric nature of term mapping is revealed.  A
              visualization tool graphically shows complex term
              relationships identified by this model. The
              context-sensitive mapping experiment tests the
              robustness of the network against the removal of each
              document node under testing. The performance of the
              complete network is compared to that of the reduced
              network.  The results imply that mapping is largely
              dependent on regularities emerging from the entire
              pattern of connections in the network rather than
              localist representations. The mapping from specific to
              general shows better performance than the mapping from
              general to specific. Several issues related to the model
              including limitations, application of a learning
              algorithm, and the generality of the study are
              discussed. },
  url          = {http://www.acm.org/pubs/citations/proceedings/ir/312624/p198-lee/}
}

@InProceedings{lee99:adass,
  author    = {{Lee}, Jonghoon and {Dubin}, David S.
               and {Kurtz}, Michael J.},
  title     = {Co-occurrence Evidence for Subject Vocabulary
               Reconciliation in {ADS} Databases},
  booktitle = {Astronomical Data Analysis Software and Systems VIII},
  editor    = {Mehringer, D. M. and Plante, R. L. and Roberts, D. A.},
  volume    = 172,
  series    = {A.S.P. Conference Series},
  year      = 1999,
  publisher = {Astronomical Society of the Pacific},
  address   = {San Francisco},
  pages     = {287--290},
  abstract  = {The NASA Astrophysics Data System gives astronomers all
              over the world access to over a million abstracts in the
              areas of astronomy and astrophysics, instrumentation,
              physics and geophysics. A mixture of indexing
              vocabularies has limited ADS searchers' ability to
              conduct precise subject searches, but prospects for a
              single consistent vocabulary of descriptors are
              promising. We report results of an ongoing project to
              reconcile the heterogeneous indexing (keywords, index
              terms, and subject headings) applied to the existing ADS
              records. Evidence for term mappings are sought in the
              consistent assignment of different descriptors to the
              same documents. We describe the apprehension of this
              evidence through the use of a spreading activation
              network. Design and deployment of an experimental
              interface to assess these mappings is now under way. },
  url       = {http://www.adass.org/adass/proceedings/adass98/leej/}
}

@InProceedings{dubin98:lisa,
  author    = {Dubin, David S.},
  title     = {Addressing the Heterogeneity of Subject Indexing
               in the {ADS} Databases},
  booktitle = {Library and Information Services in Astronomy {III}},
  editor    = {Grothkopf, U. and Andernach, H. and Stevens-Rayburn, S.
               and Gomez, M.},
  volume    = 153,
  series    = {A.S.P. Conference Series},
  year      = 1998,
  publisher = {Astronomical Society of the Pacific},
  address   = {San Francisco},
  pages     = {77--83},
  abstract  = {A drawback of the current document representation scheme
             in the ADS abstract service is its heterogeneous subject
             indexing. Several related but inconsistent indexing
             languages are represented in ADS. A method of reconciling
             some indexing inconsistencies is described. Using lexical
             similarity alone, one in six of descriptors in ADS can be
             automatically mapped to some other descriptor. Analysis
             of postings data can direct administrators to those
             mergings it is most important to check for errors.},
  url       = {http://www.eso.org/sci/libraries/lisa3/dubind.html}
}


@InCollection{korfhage96:sigcr,
  author    = {Korfhage, R. R. and Dubin, D. and Housman, E. M.},
  title     = {Computer-aided interactive classification: applications of
               {VIBE}},
  booktitle = {Advances in Classification Research Volume 7},
  publisher = {Information Today, Inc.},
  year      = 1998,
  editor    = {Solomon, P.},
  address   = {Medford, NJ},
  pages     = {83--101},
  abstract  = {Tools like the {VIBE} visualization system permit human
              analysts to use both an understanding of a data set's
              content and a recognition of structure that the
              visualization reveals. But what happens when a
              database's semantics are hidden from the analyst? What
              guidelines or heuristics can he or she use to reveal the
              "correct" underlying structure? Results of two
              experiments conducted at the University of Pittsburgh
              support the claim that {VIBE} analysts can uncover a
              meaningful clustering even without semantic clues. In
              one experiment artificial data sets were created in
              which some of the variables discriminate one or more
              clusters and the others contribute only random
              noise. Variable selection guidelines based on computed
              discrimination value were used in an attempt to
              distinguish between the signal and noise variables. In a
              second experiment, a human analyst's encoding of 714
              short phrases to 23 overlapping and inter-related
              categories was stripped of meaningful titles and
              relabeled with integers. A VIBE analyst was able to
              highlight relationships among the 23 categories solely
              on the basis of co-assignment of the phrases.}
}

@InProceedings{korfhage98:spie,
  author    = {Korfhage, R. R. and Dubin, D. and Housman, E. M.},
  title     = {What good is visualization: three experiments},
  booktitle = {Visual Data Exploration and Analysis {V}},
  editor    = {Erbacher, R. F. and Pang, E.},
  year      = 1998,
  publisher = {{SPIE}},
  address   = {Bellingham, WA},
  pages     = {196--207},
  url       = {http://dx.doi.org/10.1117/12.309542},
  abstract  = {Three experiments demonstrate capabilities of the {VIBE}
          information retrieval interface. The first explores the
          identification of reference points sets to spread a
          collection of displayed documents into small clusters.
          The second demonstrates using visual representation of
          a document set to determine the structure of a document
          collection vis-a-vis a given reference point set when
          all semantic information has been hidden from the user.
          In the third experiment {VIBE}, in conjunction with
          genetic algorithm techniques, refined the definition of
          a {POI} (reference or query point), improving precision
          and recall.}
}

@InCollection{dubin98:sigcr,
  author    = {Dubin, D.},
  title     = {The search for structure and the search for meaning},
  booktitle = {Advances in Classification Research Volume 6},
  publisher = {Information Today, Inc.},
  year      = 1998,
  editor    = {Schwartz, R.},
  address   = {Medford, NJ},
  pages     = {13--20},
  abstract  = {Statistical approaches to classification emphasize
              apprehension of structure by an analyst in a group of
              records, but issues of meaning and semantics are
              important, despite the focus on structure and
              algorithms. If meaning and semantics guide formal
              approaches to classification, can an understanding of
              structure in a group of records inform the development
              of a semantic classification scheme? Data visualization
              tools can help human analysts recognize structure and
              pattern in text and numeric records.}
}

@InProceedings{dubin96:npiv,
  author    = {Dubin, D. S.},
  title     = {Attribute Selection for Visualizing Multidimensional
               Document Spaces: a Progress Report},
  booktitle = {Workshop on New Paradigms in Information
               Visualization and Manipulation ({NPIV} '96)},
  editor    = {Ebert, D. S.},
  year      = 1996,
  pages     = {8--11}
}

@InCollection{dubin94:sigcr,
  author    = {Dubin, D. and Kwa{\'s}nik, B. H. and Tangmanee, C.},
  title     = {Elicitation Techniques for Classification Research},
  booktitle = {Advances in Classification Research Volume 5},
  publisher = {Information Today, Inc.},
  year      = 1996,
  editor    = {Fidel, R. and Beghtol, C. and Kwa{\'s}nik, B. H. and
               Smith, P. J.},
  address   = {Medford, NJ},
  pages     = {33--68},
  abstract  = {One of the greatest challenges in this field is
              designing studies that can shed light on the cognitive
              processes of classification.  The reason that this is so
              difficult is that many of these processes are
              implicit. People act on classificatory decisions without
              being aware of them.  The papers in this three-part
              section address different techniques that can be used to
              elicit information about how people order and cluster
              phenomena. The techniques discussed are: Ordered Trees,
              Repertory Grids, and Q-Sorts.  Each section will
              describe the origin of the technique and its past and
              potential applications. The use of the technique will be
              illustrated by examples from hypothetical cases or from
              actual ongoing research projects.}
}

@InProceedings{dubin95:sigir,
  author       = {Dubin, D.},
  title        = {Document analysis for visualization},
  booktitle    = {Proceedings of the Annual International {ACM} {SIGIR}
    Conference on Research and Development in Information Retrieval},
  pages        = {199--204},
  publisher    = {Association for Computing Machinery},
  address      = {New York},
  organization = {ACM SIGIR},
  year         = 1995,
  url          = {http://doi.acm.org/10.1145/215206.215360},
  abstract     = {An experimental term selection strategy for document
visualization is described. Strong discriminators with few
co-occurrences increase the clustering tendency of low-dimensional
document browsing spaces.  Clustering tendency is tested with
diagnostic measures adapted from the field of cluster analysis, and
confirmed using the VIBE visualization tool. This method supports
browsing in high recall, low precision document retrieval and
classification tasks. }
}

@Article{dubin94:TT,
  author  = {Dubin, D.},
  title   = {Applying similarity measures to texts},
  journal = {{TEXT} Technology},
  year    = 1994,
  volume  = 4,
  number  = 4,
  pages   = {283--291}
}

@Article{olsen94:TT,
  author  = {Olsen, K. A. and Dubin, D.},
  title   = {Maintaining a personal reference library with a word
             processor and a scanner},
  journal = {{TEXT} Technology},
  year    = 1994,
  volume  = 4,
  number  = 2,
  pages   = {149--154}
}

@InProceedings{song91:cadsa,
  author    = {Song, I. Y. and Dubin, D.},
  title     = {An intensional query processor in {Prolog}},
  booktitle = {Computer Applications in Design, Simulation, and
               Analysis},
  editor    = {Hamza, M. H.},
  year      = 1991,
  publisher = {{ACTA} Press},
  address   = {Anaheim, CA},
  pages     = {204--207}
}



@Article{dubin04:trends,
  author   = {Dubin, D.},
  title    = {The most influential paper {Gerard} {Salton} never wrote},
  journal  = {Library Trends},
  year     = 2004,
  volume   = 52,
  number   = 4,
  pages    = {748--764},
  url      = {http://hdl.handle.net/2142/1697},
  abstract = {Gerard Salton is often credited with developing the
              vector space model (VSM) for information retrieval (IR).
              Citations to Salton give the impression that the VSM must
              have been articulated as an IR model sometime between 1970
              and 1975. However, the VSM as it is understood today evolved
              over a longer time period than is usually acknowledged, and
              an articulation of the model and its assumptions did not
              appear in print until several years after those assumptions
              had been criticized and alternative models proposed. An
              often cited overview paper titled "A Vector Space Model for
              Information Retrieval" (alleged to have been published in
              1975) does not exist, and citations to it represent a
              confusion of two 1975 articles, neither of which were
              overviews of the VSM as a model of information retrieval.
              Until the late 1970s, Salton did not present vector spaces
              as models of IR generally but rather as models of specific
              computations. Citations to the phantom paper reflect an
              apparently widely held misconception that the operational
              features and explanatory devices now associated with the VSM
              must have been introduced at the same time it was first
              proposed as an IR model.}
}

@Article{dubin09:trends,
  author   = {Dubin, D. and Futrelle, J. and Plutchak, J. and Eke, J.},
  title    = {Preserving meaning, not just objects:
              semantics and digital preservation},
  journal  = {Library Trends},
  year     = 2009,
  volume   = 57,
  number   = 3,
  pages    = {595--610},
  abstract = {The ECHO DEPository project is a digital preservation
              research and development project funded by the National
              Digital Information Infrastructure and Preservation
              Program (NDIIPP) and administered by the Library of
              Congress. A key goal of this project is to investigate
              both practical solutions for supporting digital
              preservation activities today, and the more fundamental
              research questions underlying the development of the
              next generation of digital preservation systems. To
              support on-the-ground preservation efforts in existing
              technical and organizational environments, we have
              developed tools to help curators collect and manage
              Web-based digital resources, such as the Web Archives
              Workbench (Kaczmarek et al., 2008), and to enhance
              existing repositories' support for interoperability and
              emerging preservation standards, such as the Hub and
              Spoke Tool Suite (Habing et al., 2008). In the longer
              term, however, we recognize that successful digital
              preservation activities will require a more precise and
              complete account of the meaning of relationships within
              and among digital objects. This article describes
              project efforts to identify the core underlying semantic
              issues affecting long-term digital preservation, and to
              model how semantic inference may help next-generation
              archives head off long term preservation risks.}
}




@InProceedings{renear08:ischools,
  author    = {Renear, Allen H. and Dubin, David},
  title     = {{FRBR} as an interdisciplinary high-middle range theory},
  booktitle = {Proceedings of iConference 2008},
  year      = 2008,
  editor    = {De Cenzo, Angela},
  address   = {Los Angeles},
  abstract  = {We suggest that IFLA's Functional Requirements for
               Bibliographic Records is an interesting, if unexpected,
               example of Merton's "theories of the middle range" and
               show how theoretical analysis and refinement of such
               theories can illuminate the deep interdisciplinarity of
               information science.},
  url       = {http://www.ischools.org/oc/conference08/pc/PA7-3_iconf08.doc}
}

@TechReport{dubin03:obstacles,
  author      = {Dubin, D. S. and Renear, A. and Sperberg-McQueen, C. M.},
  title       = {Addressing Obstacles to the Retrieval of Structured
                 Documents},
  institution = {Graduate School of Library and Information Science,
                 University of Illinois at Urbana-Champaign},
  year        = 2003,
  number      = {UIUCLIS-\,-2003/1+EPRG},
  address     = {Champaign, IL},
  url         = {http://eprg.isrl.uiuc.edu/docs/uiuclis--2003-1+eprg.pdf},
  abstract    = {The potential for document markup, such as SGML and XML,
                 to support information retrieval is receiving considerable
                 attention. However the generally underdetermined and
                 implicit nature of even the most basic semantic
                 relationships expressed by SGML/XML markup is an obstacle
                 to its effective exploitation, as is the assumption of
                 "semantic transparency." A project to develop an adequate
                 machine-readable formalism for expressing markup semantics
                 is described, along with specific applications to
                 retrieval problems. A Prolog environment supporting
                 inferences and queries is also used to generate document
                 abstractions based on a formalized semantics for markup
                 vocabularies.}
}

@InCollection{dubin02:standards,
  author    = {Dubin, D.},
  title     = {Standards and Information},
  booktitle = {Encyclopedia of Communication and Information},
  pages     = {965--967},
  publisher = {Macmillan},
  year      = 2002,
  editor    = {Schement, J. R.},
  volume    = 3,
  address   = {New York}
}

@TechReport{dubin99report,
  author      = {Dubin, D.},
  title       = {Toward More Robust Discrimination-Based Indexing Models},
  institution = {Graduate School of Library and Information Science,
                 University of Illinois at Urbana-Champaign},
  year        = 1999,
  number      = {UIUCLIS-\,-1999/7+IRG},
  address     = {Champaign, IL},
  url         = {http://citeseer.ist.psu.edu/dubin99toward.html},
  abstract    = {An analysis of ``discriminatory power'' is presented
                 through a review and critique of Salton's Term
                 Discrimination Value model. The complexity and sensitivity
                 of this influential theory reveals different data
                 properties which may form the basis of more robust and
                 reliable indexing models. Such models can better support
                 retrieval in large scale, interactive, multi-user, and
                 multimedia information systems.}
}

@TechReport{dubin98tdv,
  author      = {Dubin, D.},
  title       = {Further Cautions for the Calculation of Discrimination
                 Values},
  institution = {Graduate School of Library and Information Science,
                 University of Illinois at Urbana-Champaign},
  year        = 1998,
  number      = {UIUCLIS-\,-1999/3+IRG},
  address     = {Champaign, IL},
  url         = {http://citeseer.ist.psu.edu/dubin99further.html},
  abstract    = {Term discrimination values computed using the cosine
                 measure without first removing stop words produced results
                 consistent with discrimination values computed using
                 Euclidean distance. Under these conditions, terms that
                 would ordinarily be considered poor discriminators are
                 ranked among the best. Systems designers intending to use
                 discrimination value for selecting or weighting document
                 attributes (especially image, sound, or other non-text
                 attributes) should note the measure's complexity and
                 sensitivity.}
}

@InCollection{dubin98clinic,
  author    = {Dubin, D.},
  title     = {Dimensions and Discriminability: The Role of Controlled
               Vocabulary in Visualizing Document Associations},
  booktitle = {Visualizing Subject Access for 21st Century Information
               Resources},
  publisher = {University of Illinois Graduate School of Library
               and Information Science},
  year      = 1998,
  editor    = {Cochrane, P. A. and Johnson, E. H.},
  address   = {Champaign, IL},
  pages     = {39--44},
  abstract  = {Visualization interfaces can improve subject access by
               highlighting the participation of document representation
               components in similarity and discrimination relationships.
               Within a set of retrieved documents, what kinds of groupings
               can controlled descriptors make explicit? Do they differ
               from clusters discriminated by natural language terms? The
               role of applied vs. extracted descriptors in classifying
               search output is examined.}
}


@Article{dubin97rev,
  author  = {Dubin, D.},
  title   = {Measurement in Information Science (book review)},
  journal = {Journal of Classification},
  year    = 1997,
  volume  = 14,
  number  = 2,
  pages   = {327--330}
}

@Article{dubin96rev,
  author  = {Dubin, D.},
  title   = {Multimedia and imaging databases (book review)},
  journal = {Information Processing and Management},
  year    = 1996,
  volume  = 32,
  number  = 6,
  pages   = {769--770}
}

@PhdThesis{dubin96,
  author   = {Dubin, D.},
  title    = {Structure in Document Browsing Spaces},
  school   = {University of Pittsburgh},
  year     = 1996,
  url      = {http://hdl.handle.net/2142/214},
  abstract = {This study proposes and evaluates a document analysis
              strategy for information retrieval with visualization
              interfaces. The goal of document analysis is to highlight
              structure that helps searchers make their own relevance
              judgements, rather than to shift judgements from humans to
              machines. Searchers can investigate that structure with
              tools for visualizing multidimensional data.

              The structure of interest in this study is discrimination
              of documents into clusters. Two diagnostic measures may
              inform selection of document attributes for cluster
              discrimination: term discrimination value (TDV) and the sum
              of pairwise term-vector correlations. A series of
              experiments tests the reliability of these measures for
              predicting clustering tendency, as measured by proportion
              of elongated triples and skewness of the distribution of
              document dissimilarities. Term discrimination value was
              found to be inversely related to skewness when stopword
              frequencies participated in the TDV calculations, and
              inversely related to elongation when stopwords were
              excluded. Correlation sum was found to be directly related
              to skewness when stopword frequencies were excluded.}
}

@Article{dubin95slmq,
  author  = {Dubin, D.},
  title   = {Search Strategies for {Internet} Resources},
  journal = {School Library Media Quarterly},
  year    = 1995,
  volume  = 24,
  number  = 1,
  pages   = {53--54}
}

@Book{spring92,
  author    = {Spring, M. B. and Dubin, D.},
  title     = {Hands-on {PostScript}},
  publisher = {Hayden Books},
  year      = 1992,
  address   = {Carmel, IN},
  note      = {(Published in Polish translation by Intersoftland of Warsaw
               as \emph{PostScript od A do Z})}
}

@Article{dubin89ties,
  author  = {Dubin, D.},
  title   = {Online databases put a new universe of resources at our
             fingertips},
  journal = {{TIES} Magazine},
  year    = 1989,
  month   = sep # "/" # oct,
  pages   = {38--43}
}












@InProceedings{renear08:JCDL,
  author       = {Renear, Allen H. and Wickett, Karen M. and Urban, Richard J.
                  and Dubin, David},
  title        = {The return of the trivial: problems formalizing
                  collection/item metadata relationships},
  booktitle    = {JCDL '08: Proceedings of the 8th ACM/IEEE-CS joint
                  conference on Digital libraries},
  year         = 2008,
  isbn         = {978-1-59593-998-2},
  pages        = {464},
  location     = {Pittsburgh, PA, USA},
  doi          = {10.1145/1378889.1379009},
  url          = {http://doi.acm.org/10.1145/1378889.1379009},
  organization = {Association for Computing Machinery},
  publisher    = {ACM},
  address      = {New York, NY, USA},
  abstract     = {Formalizing collection-level/item-level metadata
                  relationships encounters the problem of trivial
                  satisfaction. We offer a solution related to current work
                  in IR and ontology evaluation.}
}

@Misc{dubin05:DLF,
  author       = {Dubin, D.},
  title        = {Unpacking the Interpretation of {METS} Markup},
  howpublished = {Presented at the Digital Library Federation Fall Forum,
                  Charlottesville, VA},
  month        = nov,
  year         = 2005,
  abstract     = {Like most XML applications, METS, the Metadata Encoding
                  and Transmission Standard, overloads a small number of
                  generic syntactic relationships (e.g., parent/child) to
                  represent a variety of specific semantic relationships.
                  Human beings correctly infer the meaning of METS markup,
                  and these understandings inform the logic and design of
                  applications that import, export, and transform
                  METS-encoded resources and descriptions.

                  However, METS's flexibility and generality invite diverse
                  interpretations, posing challenges for processing across
                  different METS profiles and local adaptations. Robust
                  processing requires support in the form of a general
                  software library for reasoning about METS documents. We
                  describe the current state of development for such a
                  library.

                  This METS interpretation software is an application of the
                  BECHAMEL markup semantics framework. BECHAMEL applications
                  translate properties and relationships expressed in
                  conventional markup into logical assertions that unpack
                  the overloaded XML-based syntax. The inference problems we
                  aim to support include identifying inline and external
                  storage objects, mapping storage objects to resources and
                  descriptions, and correctly classifying the role of
                  namespaces.

                  Another goal of explicating the interpretation of METS
                  documents is to reserialize them in XML, directly
                  asserting as many of the inferred facts as we can. In this
                  way we hope to improve prospects for long term digital
                  preservation.}
}

@Misc{renear08:asist,
  author       = {Renear, Allen and Dubin, David and Wickett, Karen},
  title        = {When Digital Objects Change --- Exactly What Changes?},
  howpublished = {Presented at the 2008 Annual Meeting of the American
                  Society for Information Science and Technology,
                  Columbus, OH},
  month        = oct,
  year         = 2008,
  abstract     = {Formal accounts of digital objects typically
                  characterize them as strings, tuples, sets, graphs or
                  some other constructs from discrete mathematics. Such a
                  characterization implies that these objects cannot
                  undergo real change, such as losing or gaining parts, or
                  having their internal parts rearranged. And yet our
                  discourse about digital objects seems, at least if taken
                  literally, to imply that those objects routinely undergo
                  real change. One strategy for dealing with this
                  inconsistency is to affirm an account which leaves
                  information objects immutable and re-locates change in
                  the persons and communities interacting with these
                  objects.}
}

@Misc{dubin04:pitt,
  author       = {Dubin, D.},
  title        = {Semantic markup or markup semantics?},
  howpublished = {Presented at the University of Pittsburgh School of
                  Information Sciences Colloquium Series},
  month        = mar,
  year         = 2004,
  abstract     = {The Semantic Web is an active area of research and
                  standardization, aimed at what are perceived to be
                  shortcomings of current methods for representing content
                  in digital documents. Technologies such as RDF equip
                  markup languages with the functionality of general-purpose
                  knowledge representation systems, but the ability to put
                  richly encoded content on the web doesn't make the task of
                  creating that content any easier. Furthermore, the limited
                  expressivity of conventional markup languages may be
                  precisely what makes those languages attractive to and
                  usable by content creators.

                  One part of a solution to the knowledge acquisition
                  bottleneck may lie in research with more modest goals than
                  the Semantic Web. The BECHAMEL project is investigating
                  methods for characterizing the meaning of conventional
                  markup so that the facts and relationships represented by
                  the occurrence of its constructs can be explicitly,
                  comprehensively, and mechanically identified. Simple
                  fundamental semantic facts about markup that are relied on
                  by both markup language users and software designers are
                  currently left to conjectures that are error-prone,
                  incomplete, and unverifiable, even when the language
                  designer properly documents the language. Our goal is to
                  make the meaning of document markup sufficiently
                  systematic, uniform, complete, and exploitable to achieve
                  higher levels of functionality and system
                  interoperability.}
}

@Misc{dubin94:csna,
  author       = {Dubin, D.},
  title        = {{POI} discovery and the clarity of {VIBE} displays},
  howpublished = {Presented at the annual meeting of the Classification
                  Society of North America, Houston, TX},
  year         = 1994,
  month        = jun,
  abstract     = {VIBE is an experimental tool for visualizing
                  multivariate data.  Records are depicted with icons and
                  plotted in a plane based on their relative degrees of
                  association to reference points of interest (POIs) which
                  represent attributes. For the past few years VIBE has
                  been under development at the University of Pittsburgh
                  and at dh Molde College in Norway. VIBE is a general
                  tool and has little advice to offer on how its records
                  are to be represented, how optimal attributes are to be
                  identified, or how the display is to be interpreted by
                  the user. The present investigation addresses those
                  issues in an information retrieval setting, with an
                  adaptation of the skewness and elongation diagnostics of
                  Pruzansky, Tversky and Carroll's 1982 study on spatial
                  vs. tree representations of proximity data. Using
                  angular rather than Euclidean proximity measures, the
                  diagnostics are sensitive to the same data properties as
                  the VIBE positioning algorithm, provide a rough gauge of
                  the discriminatory power of POI combinations, and help
                  predict the interpretability of the VIBE display.}
}



@Misc{dubin08:ALISE,
  author       = {Dubin, D.},
  title        = {Challenges for Board Game Classification},
  howpublished = {Presented at the 2008 {ALISE} Annual
                  Conference, Philadelphia, PA},
  month        = jan,
  year         = 2008,
  abstract     = {Board and card games present particular challenges in
                  reference, cataloging, and collection development, but
                  also unique opportunities for librarians to provide
                  valuable consultation to patrons. Compared to
                  computer-mediated games, board games vary much more widely
                  in their levels of complexity, time required to learn, and
                  degree of thematic integration. A wide range of prices,
                  ease of acquisition, and space requirements makes it
                  impractical for most people to own the many games that can
                  enhance their learning, social and family lives.

                  Game packages provide some useful data for consumers (such
                  as suggested age range), and the world wide board game
                  hobby community has compiled impressive online databases
                  of descriptions, recommendations, and reviews. But
                  librarians need a richer classification system if they are
                  to provide the best service to teachers, counselors,
                  researchers, and families. We have begun a project aimed
                  at developing such a classification, addressing dimensions
                  such as the kinds of skills a game helps to develop, the
                  ways in which historical or literary themes are integrated
                  with game mechanics, and the types of social interaction
                  promoted by different games.}
}

@Misc{dubin09:PCA,
  author       = {Dubin, D.},
  title        = {On the Expressive Content of Games},
  howpublished = {Presented at the 2009 Joint Conference of the
                  National Popular Culture and American Culture
                  Associations, New Orleans},
  month        = apr,
  year         = 2009,
  abstract     = {Games are creative works of design, and with this
                  understanding we credit the people who design games, and
                  respect their copyrights and/or patent rights. We
                  recognize that a game may stand in derivation
                  relationships with one or more other games, and that
                  these inform our judgments of a game's originality and
                  the prior work to which acknowledgements are owed. But
                  with how much precision can we say exactly what a game
                  designer has contributed? On what basis do we recognize,
                  for example, that a game played on a physical surface
                  and another mediated by a computer are two versions of
                  the same game? If one game takes another as its basis,
                  then what exactly is preserved in the derivation of the
                  second from the first?

                  Similar questions on the nature of representation and
                  creative composition have been raised and debated. In
                  the philosophy of art, Jerrold Levinson proposes a
                  theory of how musical compositions come to bear esthetic
                  properties. John Searle and Barry Smith debate related
                  issues of social facts and collective
                  intentionality. The field of library science offers a
                  bibliographic model of works of authorship, their
                  expressions and physical manifestations. We find that
                  the multimedia character of games presents special
                  challenges for putting these theories and models to
                  work, and also highlights the inadequacy of certain
                  purely abstract accounts of games. But despite some
                  differing philosophical commitments, these theories open
                  windows which guide us to a richer, more complete
                  understanding of game design.}
}

@Misc{dubin96:csna,
  author       = {Dubin, D.},
  title        = {Clustering tendency and the cluster hypothesis in
                  information retrieval},
  howpublished = {Presented at the annual meeting of the Classification
                  Society of North America, Amherst, MA},
  year         = 1996,
  month        = jun,
  abstract     = {The term "clustering tendency" is used in two senses in
                  IR research: the degree to which document representations
                  exhibit groupings in a space of attributes, and the degree
                  to which documents relevant to the same query are similar
                  to each other. Opinions differ and evidence conflicts on
                  the nature of the relationship between these two senses of
                  clustering. Should cluster analysis seek to reveal
                  objective structure in text data, or is it just a
                  convenient way to group related representations? Do
                  attributes that cluster a document space connect jointly
                  relevant documents? Reexamining the role of clustering
                  tendency in IR may offer a solution to this dilemma:
                  cluster structure might help searchers locate relevant
                  documents, even if the "cluster hypothesis" is an
                  unreasonable assumption.}
}

@Misc{dubin97:csna,
  author       = {Dubin, D.},
  title        = {Challenges for the Future of Document Clustering},
  howpublished = {Presented at the annual meeting of the Classification
                  Society of North America, Washington, DC},
  year         = 1997,
  month        = jun,
  note         = {(Also presented at the 1998 meeting of the International
                  Federation of Classification Societies, Rome, Italy)},
  abstract     = {A recently published evaluation of cluster-based
                  document retrieval results (Shaw 1997) casts twenty years
                  of IR research in a new light.  Shaw's study offers
                  evidence that levels of effectiveness typically obtained
                  in cluster-based retrieval can be explained on the basis
                  of chance. If, as Shaw suggests, poor performance in
                  document clustering experiments reveals "weaknesses in
                  fundamental assumptions," then what exactly are those
                  assumptions and where has the research gone wrong? To what
                  extent have IR researchers followed recommendations in the
                  classification literature on issues such as selecting
                  objects, selecting variables, and assessing clustering
                  tendency? What do graphical and visualization techniques
                  reveal about the shape of clusters IR systems are supposed
                  to retrieve? Do Shaw's results have implications for text
                  classification methods based on machine learning?  Is
                  there a future for document clustering as it has been
                  employed in the past?}
}

@Misc{dubin99:csna,
  author       = {Dubin, D. and Pape, D. X.},
  title        = {Validation Strategies for Large-Scale Clustering
                  Applications},
  howpublished = {Presented at the annual meeting of the
                  Classification Society of North America, Pittsburgh, PA},
  year         = 1999,
  month        = jun,
  abstract     = {Kohonen's Self-Organizing Map is a very popular
                  clustering method, but few published studies describe
                  the application of well-known cluster validation methods
                  to the SOM. A few studies have compared the SOM to other
                  clustering methods, but usually with assumptions that
                  aren't consistent with the ways SOMs are used in
                  large-scale applications. Furthermore, SOM proponents may
                  have the mistaken impression that problems of scale rule
                  out traditional validation methods.

                  In scaling validation techniques for large SOM
                  applications, the nature of the clusters and how they are
                  apprehended by the SOM are at least as important as
                  algorithm complexity and the availability of computing
                  resources. We describe extensions to existing algorithms
                  for generating artificial test clusters, aimed at
                  supporting validation studies for large-scale SOMs. We
                  propose ways in which other validation techniques (such
                  as those based on the random graph hypothesis) may be
                  employed in large-scale SOM applications.}
}

@Misc{dubin00:csna,
  author       = {Dubin, D. and Pape, D. X.},
  title        = {Clustering Applications and Validation:
                  A Case Study with the {Kohonen} {SOM}},
  howpublished = {Presented at the annual meeting of the
                  Classification Society of North America, Montreal, Canada},
  month        = jun,
  year         = 2000,
  abstract     = {We have developed an integrated suite of tools for
                  conducting validation studies on the popular
                  self-organizing map. Through this development, we have
                  confronted issues that we believe shed light on the
                  reluctance of SOM researchers to adopt accepted cluster
                  validation methods. SOM researchers may object on the
                  basis of scalability issues, or on the naivet{\'e} of the
                  presupposition of a one-to-one mapping between clusters
                  and SOM cells.

                  We discuss the architecture of our system, its components
                  (artificial data generation, clustering, validation) and
                  the protocols for sharing data among them. Our design
                  choices have the goal of affording scalable and realistic
                  evaluation studies. We believe we can contribute not only
                  a better understanding of the SOM's strengths and
                  weaknesses, but also data that can improve the SOM's
                  usability as a tool for visualizing multivariate data.}
}

@Misc{dubin01:dimacs,
  author       = {Dubin, D. and Rorvig, M.},
  title        = {Classical two-way metric {MDS} adapted to handle very
                  large datasets},
  howpublished = {Presented at the {DIMACS} Workshop on Algorithms for
                  Multidimensional Scaling, Rutgers University,
                  Piscataway, {NJ}},
  month        = aug,
  year         = 2001,
  abstract     = {Visualization for analysis of complex phenomena is now
                  widely accepted.  One of these phenomena is Information
                  Retrieval (IR). IR plays a critical role in the myriad
                  decisions made daily affecting all aspects of science,
                  technology and commerce. Visualization in IR using MDS has
                  led in recent years to a number of discoveries and
                  insights. However, the terrain of large dataset
                  exploration remains an unpopulated one. Parallel
                  processing would seem a natural solution to the problem of
                  MDS on large data sets.  We have found examples of MDS
                  recast as a parallel genetic algorithm, but such
                  approaches are unsuitable for large data sets. We have
                  decided to pursue a more promising strategy: MDS as a
                  matrix factoring problem.  This method consists of three
                  steps: 1) transformation of proximities into distances; 2)
                  conversion of distances into inner products; and 3)
                  factoring the scalar product matrix. Although the third
                  stage is where parallelism promises to make our
                  computations scalable, it is the first stage that may pose
                  the most interesting challenges for human data analysts.
                  A critical factor in the use of SVD is the initial choice
                  of document object similarity measures.  Another factor of
                  importance is the representation of the document features
                  themselves, such as image primitives, text tokens, and
                  sound primitives. Work that combines these features is
                  new, but definitely affects the similarity of documents
                  scaled through MDS.  The effect of these representations
                  in the transformation of document proximities into
                  distances is unknown and requires empirical examination.
                  An additional factor in this project is the availability
                  of a reserve of multi-gigabyte, multilingual textual and
                  image data. Datasets of interest easily run to tens of
                  thousands of objects. Thus, a large field display is
                  necessary to resolve the various patterns expected to
                  emerge in the final configuration. The Digistar II
                  planetarium projection facility at the University of North
                  Texas is available for this purpose, and is capable of
                  resolution of nearly 70,000 data points simultaneously.}
}

@Misc{dubin03:dimacs,
  author       = {Dubin, D. and Ripoche, G. and Gasser, L.},
  title        = {Organizational Dynamics of Software Development},
  howpublished = {Presented at the {DIMACS} Workshop on Algorithms for
                  Multidimensional Scaling, Tallahassee, {FL}},
  month        = jun,
  year         = 2003,
  abstract     = {Mistakes, errors, and problems are a common element of
                  working life, and a part of all settings at one time or
                  another. In software production, the work of identifying
                  and resolving errors, bugs and mistakes plays a large
                  part. As complex software artifacts proliferate and become
                  more central to peoples' lives, the social cost of
                  software errors and bugs may increase. Since errors and
                  bugs reduce the effectiveness with which we can build and
                  use software systems, we're interested in understanding
                  how bugs come about, how they can best be managed, and how
                  people who build and use advanced software systems can
                  organize their work to prevent, overcome, deal with, and
                  accommodate problems. We're examining issues relating to
                  the practice of quality assurance in software teams; the
                  'normal' role of errors, mistakes, problems, insecurity,
                  and unreliability; and the assessment of multiple
                  problems, multiple constraints, and multiple actors in a
                  software artifact and its supporting organizational
                  infrastructure.

                  Widely-accessible open source bug repositories provides an
                  extremely large and diverse dataset for analyzing such
                  issues. These repositories can be analyzed quantitatively
                  (in terms of numbers of events, event types, response
                  types, dates, timelines, etc.) They can also be analyzed
                  qualitatively, by comparatively examining the texts of bug
                  reports, responses, and analyses. A third approach is
                  exploratory analysis of the data using methods such as
                  classical metric multidimensional scaling and similar
                  visualization tools. Here the goal is to uncover patterns,
                  regularities, or exceptions that may be worth
                  investigating in detail using more formal qualitative or
                  quantitative methods. In the tradition of numerous
                  bibliometric studies we are investigating patterns of
                  cocitation in Bugzilla, the repository for the Mozilla
                  development project.}
}

% Conference presentation (Classification Society of North America, 2001),
% recorded as Misc with the venue in howpublished.  The month is given as the
% standard macro so the bibliography style controls how it is printed.
@Misc{dubin01:csna,
  author =       {Dubin, D.},
  title =        {Model, Data, and System Centered Perspectives in
                  Information Retrieval Research},
  howpublished = {Presented at the annual meeting of the
                  Classification Society of North America, St. Louis, {MO}},
  month =        jun,
  year =         2001,
  abstract =     {For decades researchers in information retrieval (IR)
                  have based models on vector representations that are
                  familiar to specialists in multivariate data analysis
                  and classification. But although researchers may use
                  the same formalisms and notations for communicating
                  their research, close examination may reveal
                  disagreement on what exactly is being modeled. Such
                  disagreements reflect complementary approaches to
                  research that begin with different inquiries, permit
                  different simplifying assumptions, and aim for
                  different kinds of contributions. Some vector models
                  support the parsimonious reexpression of a document
                  collection (via clusters or latent dimensions). Some
                  are used to explore unexpected consequences of system
                  design choices. But some researchers neglect to
                  identify the empirical entities and relationships
                  their vector models are supposed to represent. This
                  talk will examine perspectives in IR research,
                  focusing on these contrasting interpretations. It will
                  be argued that examining research in their contexts
                  speaks to current issues of improving IR systems.}
}

% Conference presentation (Classification Society of North America, 2002),
% recorded as Misc with the venue in howpublished.  The month is given as the
% standard macro so the bibliography style controls how it is printed.
@Misc{dubin02:csna,
  author =       {Dubin, D.},
  title =        {Do Conceptual Spaces Have Metric Structure?},
  howpublished = {Presented at the annual meeting of the
                  Classification Society of North America, Madison, {WI}},
  month =        jun,
  year =         2002,
  abstract =     {Information scientists have long employed the notions
                  of "conceptual distance" and "intellectual space"
                  metaphorically, or for illustrative convenience in
                  their work on knowledge organization, indexing, and
                  classification. Theories of conceptual distance are
                  implicit in the design of some information retrieval
                  systems, and a few scholars have offered explicit
                  arguments that one can define meaningful metric
                  distances on taxonomies or so-called semantic spaces.

                  What empirical relations or properties are these
                  metrics supposed to model, and how would one test
                  claims that such metrics are meaningful? Are
                  conceptual metrics psychological theories? Are they
                  about regularity or pattern in documents or
                  collections? These issues bear on research in areas
                  such as information visualization and knowledge
                  discovery in disconnected literatures.}
}

% Conference presentation (Classification Society of North America, 2003),
% recorded as Misc with the venue in howpublished.  Names use the
% "Last, First" form; the month is the standard macro so the bibliography
% style controls how it is printed.
@Misc{dubin03:csna,
  author =       {Dubin, D. and Lee, J.},
  title =        {Beyond Three Dichotomies},
  howpublished = {Presented at the annual meeting of the
                  Classification Society of North America, Tallahassee, {FL}},
  month =        jun,
  year =         2003,
  abstract =     {In placing classification research studies in context,
                  it is often useful to situate them with respect to
                  dichotomies, such as supervised vs unsupervised
                  classification, or automatic classification vs
                  classifications that emerge from intellectual
                  effort. Such contrasts need not be exclusive:
                  cognitive science and AI researchers note the
                  distinction between localist and distributed
                  representations in their research, but acknowledge
                  that representations often have both localist and
                  distributed qualities. Could the other contrasting
                  approaches admit similar hybrids?

                  We explore these questions with illustrations from an
                  ongoing subject indexing project. We have developed a
                  system that takes as input sets of NASA thesaurus
                  terms assigned by professional indexers, and maps them
                  to sets of Astrophysical Journal subject headings. The
                  system is a hybrid localist/distributed connectionist
                  network that employs both supervised and unsupervised
                  classification. Although the network's operation is
                  data-driven and automated, its source of evidence is a
                  database of manually assigned category labels.}
}

% Conference presentation (ALLC/ACH 2002), recorded as Misc with the venue in
% howpublished.  The accented letter is written as a brace group starting with
% the accent command, the form BibTeX treats as a single character.  The month
% is the standard macro so the bibliography style controls how it is printed.
@Misc{aalc:skeletons,
  author =       {Sperberg-McQueen, C. M. and Renear, A. and
                  Huitfeldt, C. and Dubin, D.},
  title =        {Skeletons in the closet: Saying what markup means},
  howpublished = {Presented at ALLC/ACH, T{\"u}bingen, Germany},
  month =        jul,
  year =         2002,
  abstract =     {Our immediate area of concern is the problem of
                  providing a clear, explicit account of the meaning and
                  interpretation of markup. Scores of projects in
                  humanities computing and elsewhere assume implicitly
                  that markup is meaningful, and use its meaning to
                  govern the processing of the data. While a complete
                  account of the "meaning of markup" may seem daunting,
                  at least part of this project appears manageable:
                  explaining how to determine the set of inferences
                  about a document which are "licensed", implicitly or
                  explicitly, by its markup. However, it proves
                  remarkably difficult to find, in the literature, any
                  straightforward account of how one can go about
                  interpreting markup in such a way as to draw all and
                  only the correct inferences from it. This paper
                  describes a concrete realization of one part of a
                  model proposed earlier, and outlines some of the
                  problems encountered in specifying the inferences
                  licensed by commonly used DTDs. We focus here on the
                  development of a notation for expressing what we call
                  "sentence skeletons", or "skeleton sentences". These
                  are sentences, either in English or some other natural
                  language or in some formal notation, for expressing
                  the meaning of constructs in a markup language. They
                  are called sentence skeletons, rather than full
                  sentences, because they have blanks at various key
                  locations; a system for automatic interpretation of
                  marked up documents will generate actual sentences by
                  filling in the blanks in the sentence skeletons with
                  appropriate values from the documents themselves. We
                  describe theoretical and practical problems arising in
                  using sentence skeletons to say what the markup in
                  some commonly used DTDs actually means, in a way that
                  allows software to generate the correct inferences
                  from the markup and to exploit the information.}
}

% Conference presentation (Extreme Markup Languages 2007), recorded as Misc
% with the venue in howpublished.  howpublished carries no final period:
% standard styles append ", <month> <year>" to it and supply the punctuation.
% The month is the standard macro so the style controls how it is printed.
@Misc{dubin07:extreme,
  author =       {Dubin, D.},
  title =        {Instance or expression? Another look at reification},
  howpublished = {Presented at Extreme Markup Languages 2007, Montreal},
  month =        aug,
  year =         2007,
  abstract =     {Conventional use of the RDF reification vocabulary is
                  based on an understanding that triples stand in a
                  type/instance relationship with "tokens" appearing in
                  RDF documents. But this convention, intended to
                  support provenance documentation, presents puzzles for
                  understanding how a serialized expression can stand in
                  direct relationships with resources referred to by an
                  abstract triple. Recording the provenance of an RDF
                  statement may be better achieved by reifying the
                  notation expressing it. The same system can then be
                  applied both to RDF and to statements expressed using
                  other XML applications. We illustrate with examples
                  from a digital preservation research project.}
}

% Conference presentation (Balisage 2008), recorded as Misc with the venue in
% howpublished.  howpublished carries no final period: standard styles append
% ", <month> <year>" to it and supply the punctuation.  The month is the
% standard macro so the style controls how it is printed.
@Misc{dubin08:balisage,
  author =       {Dubin, David and Birnbaum, David J.},
  title =        {Reconsidering Conventional Markup
                  for Knowledge Representation},
  howpublished = {Presented at Balisage: the Markup Conference, Montreal},
  month =        aug,
  year =         2008,
  abstract =     {The main attraction of semantic web technologies such
                  as RDF and OWL over conventional markup is the support
                  those tools provide for expressing precise
                  semantics. Formal grounding for RDF-based languages
                  (in, for example, description logics) and their
                  integration with logic programming tools are guided
                  and constrained by issues of decidability and the
                  tractability of computations. Users of these
                  technologies are invited to use less expressive
                  representations, and thereby work within those
                  constraints. Such compromises seem reasonable when
                  considering the roles automated reasoning agents are
                  expected to play by the semantic web community. But
                  where expectations differ, it may be useful to
                  reconsider using conventional markup and inferencing
                  methods that have been applied with success despite
                  their theoretical weaknesses. We illustrate these
                  issues with a case study from manuscript studies and
                  textual transmission.}
}

% Conference presentation (ALLC/ACH 2005), recorded as Misc with the venue in
% howpublished.  Names use the "Last, First" form; the month is the standard
% macro so the bibliography style controls how it is printed.
@Misc{downie05:ACH,
  author =       {Downie, J. S. and Renear, A. and Mathes, A. and
                  Medina, K. and Dubin, D. and Lee, J. H.},
  title =        {Modelling Complex Multimedia Relationships in the Humanities
                  Computing Context: Are {Dublin} {Core} and {FRBR} up to the Task?},
  howpublished = {Presented at ALLC/ACH, Victoria, British Columbia},
  month =        jun,
  year =         2005,
  url =          {http://mustard.tapor.uvic.ca/cocoon/ach_abstracts/xq/pdf.xq?id=206},
  abstract =     {It is now widely recognized that the creation,
                  management, and analysis of content other than text is
                  extremely important if the digital humanities are to
                  deliver access to, and provide an analytical purchase
                  on, the full range of human culture. However it is not
                  clear to us whether the cataloguing and classification
                  systems for digital content are up to the
                  task. Difficulties in this area threaten to impede
                  both the development of tools and techniques and the
                  production of sound theoretical results. In our paper
                  we discuss some of these problems, focusing on
                  relationships amongst the various cultural modes of
                  expression. With the intention of convening a larger
                  discussion of how these confusions might be remedied,
                  we then propose directions for some clarification and
                  improvement. However, the larger issues here are not
                  merely terminological and resist any easy resolution.}
}

% Conference presentation (ALLC/ACH 2005), recorded as Misc with the venue in
% howpublished.  Names use the "Last, First" form; the month is the standard
% macro so the bibliography style controls how it is printed.
@Misc{dubin05:ACH,
  author =       {Dubin, D. and Birnbaum, D. J.},
  title =        {A Declarative Framework for Modeling Pronunciation
                  and Rhyme},
  howpublished = {Presented at ALLC/ACH, Victoria, British Columbia},
  month =        jun,
  year =         2005,
  url =          {http://mustard.tapor.uvic.ca/cocoon/ach_abstracts/xq/pdf.xq?id=198},
  abstract =     {Encoding standards such as TEI give scholars a great
                  deal of flexibility in annotating texts to meet the
                  particular needs of a study or project. Researchers
                  necessarily make choices about which features of a
                  text to highlight, what kinds of additional
                  information to add, and what facts are left to be
                  inferred from other sources of evidence apart from
                  markup. Among the factors to be considered in
                  designing or adopting text encoding procedures are the
                  prospects for data reuse and generalization beyond the
                  scope of one's current project, investigating new
                  questions about the texts that hadn't been
                  anticipated, and planning for the integration of texts
                  using other markup schemes. This paper discusses
                  considerations motivating the ongoing design of a
                  software framework for analysis of rhyming schemes in
                  19th century Russian poetry. Our current
                  implementation is written in Prolog as an application
                  of the BECHAMEL system for markup semantics
                  analysis. The motivation for this choice was our wish
                  to plan from the beginning for extensions to other
                  encoding schemes and generalization to other kinds of
                  analysis.}
}

% Conference presentation (Digital Humanities 2008), recorded as Misc with
% the venue in howpublished.  The month is the standard macro so the
% bibliography style controls how it is printed.
@Misc{renear08:DH,
  author =       {Renear, Allen H. and Urban, Richard J. and
                  Wickett, Karen M. and Palmer, Carole L. and
                  Dubin, David},
  title =        {Sustaining Collection Value: Managing
                  Collection/Item Metadata Relationships},
  howpublished = {Presented at Digital Humanities 2008, Oulu, Finland},
  month =        jun,
  year =         2008,
  url =          {http://hdl.handle.net/2142/9128},
  abstract =     {Many aspects of managing collection/item metadata
                  relationships are critical to sustaining collection
                  value over time. Metadata at the collection-level not
                  only provides context for finding, understanding, and
                  using the items in the collection, but is often
                  essential to the particular research and scholarly
                  activities the collection is designed to
                  support. Contemporary retrieval systems, which search
                  across collections, usually ignore collection level
                  metadata. Alternative approaches, informed by
                  collection-level information, will require an
                  understanding of the various kinds of relationships
                  that can obtain between collection-level and
                  item-level metadata. This paper outlines the problem
                  and describes a project that is developing a
                  logic-based framework for classifying
                  collection-level/item-level metadata
                  relationships. This framework will support (i)
                  metadata specification developers defining metadata
                  elements, (ii) metadata librarians describing objects,
                  and (iii) system designers implementing systems that
                  help users take advantage of collection-level
                  metadata.}
}

% Conference presentation (Digital Humanities 2006), recorded as Misc with
% the venue in howpublished.  Names use the "Last, First" form; the month is
% the standard macro so the bibliography style controls how it is printed.
@Misc{huitfeldt06:DH,
  author =       {Huitfeldt, Claus and Sperberg-McQueen, Michael and
                  Dubin, David and Johnsen, Lars G.},
  title =        {Markup Languages for Complex Documents:
                  an Interim Project Report},
  howpublished = {Presented at Digital Humanities, Paris},
  month =        jul,
  year =         2006,
  abstract =     {For all the developments in XML since 1998, one thing
                  that has not changed is the understanding of XML
                  documents as serializations of tree structures
                  conforming to the constraints expressed in the
                  document's schema. Notwithstanding XML's many
                  strengths, there are problem areas which invite
                  further research on some of the fundamental
                  assumptions of XML and the document models associated
                  with it. It is a challenge to represent in XML
                  anything that does not easily lend itself to
                  representation by context-free or constituent
                  structure grammars, such as overlapping or fragmented
                  elements, and multiple co-existing complete or partial
                  alternative structures or orderings. For the purpose
                  of our work, we call such structures complex
                  structures, and we call documents containing such
                  structures complex documents. The MLCD (Markup
                  Languages for Complex Documents) project aims to
                  integrate alternative approaches by developing both an
                  alternative notation, a data structure and a
                  constraint language which as far as possible is
                  compatible with and retains the strengths of XML-based
                  markup, yet solves the problems with representation
                  and processing of complex structures. MLCD started in
                  2001 and is expected to complete its work in 2007. The
                  project is a collaboration between a group of
                  researchers based at several different institutions.}
}

% Conference presentation (International Federation of Classification
% Societies, 2004), recorded as Misc with the venue in howpublished.  Names
% use the "Last, First" form; the month is the standard macro so the
% bibliography style controls how it is printed.
@Misc{dubin04:ifcs,
  author =       {Birnbaum, D. and Dubin, D.},
  title =        {Measuring Similarity in the Contents of Medieval Miscellany
                  Manuscripts},
  howpublished = {Presented at the ninth biennial meeting of the
                  International Federation of Classification Societies,
                  Chicago, {IL}},
  month =        jul,
  year =         2004,
  abstract =     {Edit distance for strings is measured in terms of the
                  smallest number of specific stepwise modifications
                  (insertions, deletions, transpositions, or
                  replacements of characters) required to make a
                  transition from one state to another. We describe our
                  experiments with an analogous measure for larger units
                  of analysis, namely texts copied in medieval
                  miscellany manuscripts. The goal of the project is to
                  describe the likely process employed by medieval
                  scribes in compiling these manuscripts, and the data
                  consists of the manuscripts' (virtual) tables of
                  contents. The process of compiling a miscellany
                  manuscript is different from the process of copying
                  prose text, and one inserts, deletes, transposes, or
                  replaces constituent texts in a miscellany manuscript
                  differently than one inserts, deletes, transposes, or
                  replaces words in continuous prose. We present an
                  algorithm for measuring similarity that is informed by
                  those differences, and hierarchical cluster analyses
                  on the contents of approximately one hundred
                  manuscripts.}
}

% Conference presentation (International Federation of Classification
% Societies, 2006), recorded as Misc with the venue in howpublished.  The note
% is a balanced parenthetical with no final period, because standard styles
% add the closing period themselves.  The month is the standard macro so the
% bibliography style controls how it is printed.
@Misc{dubin06:ifcs,
  author =       {Dubin, D.},
  title =        {Reframing Author Cocitation Analysis},
  howpublished = {Presented at the tenth biennial meeting of the
                  International Federation of Classification Societies,
                  Ljubljana, Slovenia},
  month =        jul,
  year =         2006,
  note =         {(also presented at the 2006 meeting of the
                  Classification Society of North America,
                  Piscataway, {NJ})},
  abstract =     {Author cocitation analysis (ACA) employs a variety of
                  clustering and scaling techniques for exploring
                  patterns in bibliometric data. In a 2003 JASIST
                  article, Per Ahlgren, Bo Jarneving, and Ronald
                  Rousseau criticized the use of Pearson's r as a
                  similarity measure in ACA, citing properties that they
                  believe make r undesirable. Howard White's response to
                  Ahlgren et al, and the subsequent exchange in the
                  JASIST letters column suggest that behind this
                  methodological controversy there lurk contrasting
                  intuitions about both the reality ACA is supposed to
                  reveal and its relationship to the bibliographic
                  evidence on which ACA is based. A reframing of the ACA
                  data analysis methodology puts some of these
                  controversies in a new light.}
}

