%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % % % % A Bibliography on Automated Text Categorization % % % % updated and maintained by % % % % Evgeniy Gabrilovich % % Department of Computer Science % % Technion - Israel Institute of Technology % % Technion City, Haifa 32000, Israel % % Email: gabr AT cs . technion . ac . il % % WWW: http://www.cs.technion.ac.il/~gabr % % % % % % originally created by % % % % Fabrizio Sebastiani % % Dipartimento di Matematica Pura e Applicata % % Universita' di Padova % % Via Giovanni Battista Belzoni, 7 - 35131 Padova, Italy % % http://www.math.unipd.it/~fabseb60/ % % % % % % This is a bibliography, in BibTeX format, on automated text % % categorization (ATC), defined as the activity of automatically % % building, by means of machine learning techniques, automatic text % % classifiers, i.e. systems capable of assigning to a text % % document one or more thematic categories from a predefined set. % % % % This bibliography resides at % % http://www.cs.technion.ac.il/~gabr/resources/atc/ATCbibliography.bib % % A companion Web page is available at % % http://www.cs.technion.ac.il/~gabr/resources/atc/atcbib.html % % Everyone is welcome to download the bibliography as a whole and % % distribute it, provided that it is distributed untouched. % % % % Everyone is also welcome to let me know either additional % % references or corrections and additions (e.g. URLs, where % % they are not already present) to the existing ones. % % In general, only references specific to ATC are considered % % pertinent to this bibliography; in particular, references that % % *are* considered pertinent are: % % % % * publications that discuss novel ATC methods, novel % % experimentation of previously known methods, or resources for % % ATC experimentation; % % % % * publications that discuss applications of ATC (e.g. % % automated indexing for Boolean IR systems, filtering, etc.). 
% % % % References that are *not* considered pertinent are: % % % % * publications that discuss techniques in principle useful for % % ATC (e.g. machine learning techniques, information retrieval % % techniques) but do not explicitly discuss their application % % to ATC; % % % % * publications that discuss related topics sometimes confused with % % ATC; these include, in particular, text clustering (i.e. text % % classification by unsupervised learning) and text indexing; % % % % * technical reports and workshop papers. Only papers that have % % been the object of formal publication (i.e. conferences and % % journals) are to be included in the bibliography, so as to avoid % % its explosion and the inclusion of material bound to obsolescence. % % % % Concerning URLs from which to download on-line copies of the % % papers, where possible I have included URLs with unrestricted % % access (e.g. home pages of authors). When such URLs were not % % available, sometimes a URL with restricted access (e.g. the % % ACM Digital Library or the IEEE Computer Society Digital % % Library, which are accessible to subscribers only) is indicated. % % When this is the case, if you know of a URL with unrestricted access % % from which the paper is also available, please let me know and I % % will substitute the link. 
% % % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % ENTRY TYPES % % Article % = an article from a journal or magazine % required: author, title, journal, year % optional: volume, number, pages, month, note % % Book % = a book with an explicit publisher % required: author or editor, title, publisher, year % optional: volume or number, series, address, edition, month, note % % Booklet % = a work that is printed and bound, but without a named publisher % or sponsoring institution % required: title % optional: author, howpublished, address, month, year, note % % Conference % = same as 'InProceedings', included for compatibility with older versions % % InBook % = a part of a book, usually untitled; it may be a chapter % (or other sectional unit) and/or range of pages % required: author or editor, title, chapter and/or pages, publisher, year % optional: volume or number, series, type, address, edition, month, note % % InCollection % = a part of a book with its own title % required: author, title, booktitle, publisher, year % optional: editor, volume or number, series, type, chapter, pages, % address, edition, month, note % % InProceedings % = an article in a conference proceedings % required: author, title, booktitle, year % optional: editor, volume or number, series, pages, address, month, % organization, publisher, note % % Manual % = technical documentation % required: title % optional: author, organization, address, edition, month, year, note % % MastersThesis % = a master's thesis % required: author, title, school, year % optional: type, address, month, note % % Misc % = use this type when nothing else fits % required: none % optional: author, title, howpublished, month, year, note % % PhDThesis % = a Ph.D. 
thesis % required: author, title, school, year % optional: type, address, month, note % % Proceedings % = the proceedings of a conference % required: title, year % optional: editor, volume or number, series, address, month, % organization, publisher, note % % TechReport % = a report published by a school or other institution, % usually numbered within a series % required: author, title, institution, year % optional: type, number, address, month, note % % Unpublished % = a document with an author and title, but not formally published % required: author, title, note % optional: month, year % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @inProceedings{Adam02, author = {Chai K. Adam and Hwee T. Ng and Hai L. Chieu}, title = {Bayesian Online Classifiers for Text Classification and Filtering}, booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on Research and Development in Information Retrieval}, editor = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng and Kalervo J{\"{a}}rvelin}, publisher = {ACM Press, New York, US}, address = {Tampere, FI}, year = {2002}, pages = {97--104}, url = {http://doi.acm.org/10.1145/564376.564395}, abstract = {This paper explores the use of Bayesian online classifiers to classify text documents. Empirical results indicate that these classifiers are comparable with the best text classification systems. 
Furthermore, the online approach offers the advantage of continuous learning in the batch-adaptive text filtering task.}, } @inProceedings{Adami03, author = {Giordano Adami and Paolo Avesani and Diego Sona}, title = {Bootstrapping for hierarchical document classification}, booktitle = {Proceedings of CIKM-03, 12th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, editor = {}, year = {2003}, address = {New Orleans, US}, pages = {295--302}, url = {http://doi.acm.org/10.1145/956863.956920}, abstract = {Managing the hierarchical organization of data is starting to play a key role in the knowledge management community due to the great amount of human resources needed to create and maintain these organized repositories of information. Machine learning community has in part addressed this problem by developing hierarchical supervised classifiers that help maintainers to categorize new resources within given hierarchies. Although such learning models succeed in exploiting relational knowledge, they are highly demanding in terms of labeled examples, because the number of categories is related to the dimension of the corresponding hierarchy. Hence, the creation of new directories or the modification of existing ones require strong investments.This paper proposes a semi-automatic process (interleaved with human suggestions) whose aim is to minimize (simplify) the work required to the administrators when creating, modifying, and maintaining directories. Within this process, bootstrapping a taxonomy with examples represents a critical factor for the effective exploitation of any supervised learning model. For this reason we propose a method for the bootstrapping process that makes a first hypothesis of categorization for a set of unlabeled documents, with respect to a given empty hierarchy of concepts. 
Based on a revision of Self-Organizing Maps, namely TaxSOM, the proposed model performs an unsupervised classification, exploiting the a-priori knowledge encoded in a taxonomy structure both at the terminological and topological level. The ultimate goal of TaxSOM is to create the premise for successfully training a supervised classifier.}, } @inProceedings{Aggarwal99, author = {Charu C. Aggarwal and Stephen C. Gates and Philip S. Yu}, title = {On the merits of building categorization systems by supervised clustering}, booktitle = {Proceedings of EDBT-00, 7th International Conference on Extending Database Technology}, publisher = {ACM Press, New York, US}, year = {1999}, address = {Konstanz, DE}, pages = {352--356}, url = {http://doi.acm.org/10.1145/312129.312279}, abstract = {This paper investigates the use of supervised clustering in order to create sets of categories for classification of documents. We use information from a pre-existing taxonomy in order to supervise the creation of a set of related clusters, though with some freedom in defining and creating the classes. We show that the advantage of using supervised clustering is that it is possible to have some control over the range of subjects that one would like the categorization system to address, but with a precise mathematical definition of each category. We then categorize documents using this a priori knowledge of the definition of each category. We also discuss a new technique to help the classifier distinguish better among closely related clusters. Finally, we show empirically that this categorization system utilizing a machine-derived taxonomy performs as well as a manual categorization process, but at a far lower cost.}, } @inProceedings{Agrawal00, author = {Rakesh Agrawal and Roberto J. 
Bayardo and Ramakrishnan Srikant}, title = {{\sc Athena}: Mining-based Interactive Management of Text Databases}, booktitle = {Proceedings of EDBT-00, 7th International Conference on Extending Database Technulogy}, editor = {Carlo Zaniolo and Peter C. Lockemann and Marc H. Scholl and Torsten Grust}, year = {2000}, address = {Konstanz, DE}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1777}, pages = {365--379}, url = {http://www.almaden.ibm.com/cs/people/ragrawal/papers/athena.ps}, abstract = {We describe Athena: a system for creating, exploiting, and maintaining a hierarchical arrangement of textual documents through interactive mining-based operations. Requirements of any such system include speed and minimal end-user effort. Athena satisfies these requirements through linear-time classification and clustering engines which are applied interactively to speed the development of accurate models. Naive Bayes classifiers are recognized to be among the best for classifying text. We show that our specialization of the Naive Bayes classifier is considerably more accurate (7 to 29\% absolute increase in accuracy) than a standard implementation. Our enhancements include using Lidstone's law of succession instead of Laplace's law, under-weighting long documents, and over-weighting author and subject. We also present a new interactive clustering algorithm, C-Evolve, for topic discovery. C-Evolve first finds highly accurate cluster digests (partial clusters), gets user feedback to merge and correct these digests, and then uses the classification algorithm to complete the partitioning of the data. 
By allowing this interactivity in the clustering process, C-Evolve achieves considerably higher clustering accuracy (10 to 20\% absolute increase in our experiments) than the popular K-Means and agglomerative clustering methods.}, } @inProceedings{Agrawal01, author = {Rakesh Agrawal and Ramakrishnan Srikant}, title = {On integrating catalogs}, booktitle = {Proceedings of WWW-01, 10th International Conference on the World Wide Web}, publisher = {ACM Press, New York, US}, editor = {}, year = {2001}, address = {Hong Kong, CN}, pages = {603--612}, url = {http://doi.acm.org/10.1145/371920.372163}, abstract = {We address the problem of integrating documents from different sources into a master catalog. This problem is pervasive in web marketplaces and portals. Current technology for automating this process consists of building a classifier that uses the categorization of documents in the master catalog to construct a model for predicting the category of unknown documents. Our key insight is that many of the data sources have their own categorization, and classification accuracy can be improved by factoring in the implicit information in these source categorizations. We show how a Naive Bayes classification can be enhanced to incorporate the similarity information present in source catalogs. Our analysis and empirical evaluation show substantial improvement in the accuracy of catalog integration.}, } @inProceedings{Aizawa00, author = {Akiko Aizawa}, title = {The feature quantity: an information-theoretic perspective of tfidf-like measures}, booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on Research and Development in Information Retrieval}, editor = {Nicholas J. 
Belkin and Peter Ingwersen and Mun-Kew Leong}, publisher = {ACM Press, New York, US}, address = {Athens, GR}, year = {2000}, pages = {104--111}, url = {http://doi.acm.org/10.1145/345508.345556}, abstract = {The feature quantity, a quantitative representation of specificity introduced in this paper, is based on an information theoretic perspective of co-occurrence events between terms and documents. Mathematically, the feature quantity is defined as a product of probabillty and information, and maintains a good correspondence with the tfidf-like measures popularly used in today's IR systems. In this paper, we present a formal description of the feature quantity, as well as some illustrative examples of applying such a quantity to different types of information retrieval tasks: representative term selection and text categorization.}, } @inProceedings{Aizawa01, author = {Akiko Aizawa}, title = {Linguistic Techniques to Improve the Performance of Automatic Text Categorization}, booktitle = {Proceedings of NLPRS-01, 6th Natural Language Processing Pacific Rim Symposium}, editor = {}, publisher = {}, address = {Tokyo, JP}, year = {2001}, pages = {307--314}, url = {http://www.afnlp.org/nlprs2001/pdf/0079-01.pdf}, abstract = {This paper presents a method for incorporating natural language processing into existing text categorization procedures. Three aspects are considered in the investigation: (i) a method for weighting terms based on the concept of a probability weighted amount of information, (ii) estimation of term occurrence probabilities using a probabilistic language model, and (iii) automatic extraction of terms based on POS tags automatically generated by a morphological analyzer. 
The effects of these considerations are examined in the experiments using Reuters-21578 and NTCIR-J1 standard test collections.}, } @inProceedings{Alias02, author = {Francesc Al{\'i}as and Ignasi Iriondo and Pere Barnola}, title = {Multi-domain text classification for unit selection text-to-speech synthesis}, booktitle = {Proceedings of ICPhS-03, 15th International Congress on Phonetic Sciences}, address = {Barcelona, ES}, editor = {}, publisher = {}, year = {2003}, pages = {}, url = {}, abstract = {}, } @inProceedings{AlKofahi01, author = {Khalid Al-Kofahi and Alex Tyrrell and Arun Vachher and Tim Travers and Peter Jackson}, title = {Combining Multiple Classifiers for Text Categorization}, booktitle = {Proceedings of CIKM-01, 10th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, editor = {Henrique Paques and Ling Liu and David Grossman}, year = {2001}, address = {Atlanta, US}, pages = {97--104}, url = {http://doi.acm.org/10.1145/502585.502603}, abstract = {A major problem facing online information services is how to index and supplement large document collections with respect to a rich set of categories. We focus upon the routing of case law summaries to various secondary law volumes in which they should be cited. Given the large number (> 13,000) of closely related categories, this is a challenging task that is unlikely to succumb to a single algorithmic solution. Our fully implemented and recently deployed system shows that a superior classification engine for this task can be constructed from a combination of classifiers. 
The multi-classifier approach helps us leverage all the relevant textual features and meta data, and appears to generalize to related classification tasks.}, } @inProceedings{Amati96, author = {Gianni Amati and Daniela D'Aloisi and Vittorio Giannini and Flavio Ubaldini}, title = {An Integrated System for Filtering News and Managing Distributed Data}, booktitle = {Proceedings of PAKM-96, 1st International Conference on Practical Aspects of Knowledge Management}, editor = {}, publisher = {}, year = {1996}, pages = {}, note = {An extended version appears as~\cite{Amati97b}}, address = {Basel, CH}, url = {http://airone.fub.it:8080/projects/pakm96.ps}, abstract = {With the development and diffusion of the Internet worldwide connection, a large amount of information can be delivered to the users. To avoid their being overflowed by the incoming data, methods of information filtering are required. Thus, there is the problem of determining what information is relevant to the user and how this decision can be taken by a supporting system. Parametric and qualitative descriptors of user's interest must be generated. This paper presents two approaches. The first concerns an information filtering system based on an adaptation of the generalized probabilistic model of information retrieval. The user profile is a vector of weighted terms which are learned from the relevance assessment values given by the user on the training set. Positive terms are considered relevant to the informative need of the user, negative ones irrelevant. The relevance values are interpreted as subjective probabilities and hence are mapped into the real interval [0; 1]. ProFile is a filtering system for the netnews which uses this model with a scale of 11 predefined values of relevance. ProFile allows the user to update on-line his profile and to check the discrepancy between his assessment and the prediction of relevance of the system. 
The second concerns the InfoAgent, a system for supporting users in retrieving data in distributed and heterogeneous archives and repositories. The architecture is based on the metaphor of the software agents and incorporates innovative hints from other fields: distributed architectures, relevance feedback and active interfaces. The system has a cooperative and supportive role: it understands the user's needs and learns from his behavior. Its aim is to disengage the user from learning complex tools and from performing tedious and repetitive actions.}, } @inProceedings{Amati97, author = {Gianni Amati and Fabio Crestani and Flavio Ubaldini}, title = {A learning system for selective dissemination of information}, booktitle = {Proceedings of IJCAI-97, 15th International Joint Conference on Artificial Intelligence}, editor = {Martha E. Pollack}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, year = {1997}, pages = {764--769}, address = {Nagoya, JP}, url = {http://www.cs.strath.ac.uk/~fabioc/papers/97-ijcai.pdf}, abstract = {New methods and new systems are needed to filter or to selectively distribute the increasing volume of electronic information being produced nowadays. An effective information filtering system is one that provides the exact information that fulfills a user's interest with the minimum effort by the user to describe it. Such a system will have to be adaptive to the user changing interest. In this paper we present a learning system for information filtering and selective information dissemination. 
The learning algorithm is described and the effectiveness of the system is evaluated in a true information filtering style.}, } @inProceedings{Amati97a, author = {Gianni Amati and Fabio Crestani and Flavio Ubaldini and Stefano De Nardis}, title = {Probabilistic Learning for Information Filtering}, booktitle = {Proceedings of RIAO-97, 1st International Conference ``Recherche d'Information Assistee par Ordinateur''}, editor = {Luc Devroye and Claude Chrisment}, address = {Montreal, CA}, year = {1997}, pages = {513--530}, note = {An extended version appears as~\cite{Amati99}}, url = {http://www.cs.strath.ac.uk/~fabioc/papers/97-riao.pdf}, abstract = {In this paper we describe and evaluate a learning model for information filtering which is an adaptation of the generalised probabilistic model of Information Retrieval. The model is based on the concept of ``uncertainty sampling'', a technique that allows for relevance feedback both on relevant and non relevant documents. The proposed learning model is the core of a prototype information filtering system called ProFile.}, } @article{Amati97b, author = {Gianni Amati and Daniela D'Aloisi and Vittorio Giannini and Flavio Ubaldini}, title = {A Framework for Filtering News and Managing Distributed Data}, journal = {Journal of Universal Computer Science}, year = {1997}, number = {8}, volume = {3}, pages = {1007--1021}, url = {http://www.jucs.org/jucs_3_8/a_framework_for_filtering}, abstract = {With the development and diffusion of the Internet worldwide connection, a large amount of information is available to the users. Methods of information filtering and fetching are then required. This paper presents two approaches. The first concerns the information filtering system ProFile based on an adaptation of the generalized probabilistic model of information retrieval. ProFile filters the netnews and uses a scale of 11 predefined values of relevance. 
ProFile allows the user to update on-line the profile and to check the discrepancy between the assessment and the prediction of relevance of the system. The second concerns ABIS, an intelligent agent for supporting users in filtering data from distributed and heterogeneous archives and repositories. ABIS minimizes user's effort in selecting the huge amount of available documents. The filtering engine memorizes both user preferences and past situations. ABIS compares documents with the past situations and finds the similarity scores on the basis of a memory-based reasoning approach.}, } @article{Amati99, author = {Gianni Amati and Fabio Crestani}, title = {Probabilistic learning for selective dissemination of information}, journal = {Information Processing and Management}, pages = {633--654}, year = {1999}, number = {5}, volume = {35}, url = {http://www.cs.strath.ac.uk/~fabioc/papers/99-ipem.pdf}, abstract = {New methods and new systems are needed to filter or to selectively distribute the increasing volume of electronic information being produced nowadays. An effective information filtering system is one that provides the exact information that fulfills user's interests with the minimum effort by the user to describe it. Such a system will have to be adaptive to the user changing interest. In this paper we describe and evaluate a learning model for information filtering which is an adaptation of the generalized probabilistic model of Information Retrieval. The model is based on the concept of `uncertainty sampling', a technique that allows for relevance feedback both on relevant and nonrelevant documents. The proposed learning model is the core of a prototype information filtering system called ProFile.}, } @inProceedings{Androutsopoulos00, author = {Ion Androutsopoulos and John Koutsias and Konstandinos V. Chandrinos and Constantine D. 
Spyropoulos}, title = {An experimental comparison of naive Bayesian and keyword-based anti-spam filtering with personal e-mail messages}, booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on Research and Development in Information Retrieval}, editor = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong}, publisher = {ACM Press, New York, US}, address = {Athens, GR}, year = {2000}, pages = {160--167}, url = {http://doi.acm.org/10.1145/345508.345569}, abstract = {The growing problem of unsolicited bulk e-mail, also known as ``spam'', has generated a need for reliable anti-spam e-mail filters. Filters of this type have so far been based mostly on manually constructed keyword patterns. An alternative approach has recently been proposed, whereby a Naive Bayesian classifier is trained automatically to detect spam messages. We test this approach on a large collection of personal e-mail messages, which we make publicly available in "encrypted" form contributing towards standard benchmarks. We introduce appropriate cost-sensitive measures, investigating at the same time the effect of attribute-set size, training-corpus size, lemmatization, and stop lists, issues that have not been explored in previous experiments. 
Finally, the Naive Bayesian filter is compared, in terms of performance, to a filter that uses keyword patterns, and which is part of a widely used e-mail reader.}, } @article{Appiani01, author = {Enrico Appiani and Francesca Cesarini and Annamaria Colla and Massimiliano Diligenti and Marco Gori and Simone Marinai and Giovanni Soda}, title = {Automatic document classification and indexing in high-volume applications}, journal = {International Journal on Document Analysis and Recognition}, year = {2001}, number = {2}, volume = {4}, pages = {69--83}, url = {http://link.springer-ny.com/link/service/journals/10032/papers/1004002/10040069.pdf}, abstract = {In this paper a system for analysis and automatic indexing of imaged documents for high-volume applications is described. This system, named STRETCH (STorage and RETrieval by Content of imaged documents), is based on an Archiving and Retrieval Engine, which overcomes the bottleneck of document profiling bypassing some limitations of existing pre-defined indexing schemes. The engine exploits a structured document representation and can activate appropriate methods to characterise and automatically index heterogeneous documents with variable layout. The originality of STRETCH lies principally in the possibility for unskilled users to define the indexes relevant to the document domains of their interest by simply presenting visual examples and applying reliable automatic information extraction methods (document classification, flexible reading strategies) to index the documents automatically, thus creating archives as desired. STRETCH offers ease of use and application programming and the ability to dynamically adapt to new types of documents. The system has been tested in two applications in particular, one concerning passive invoices and the other bank documents. In these applications, several classes of documents are involved. 
The indexing strategy first automatically classifies the document, thus avoiding pre-sorting, then locates and reads the information pertaining to the specific document class. Experimental results are encouraging overall; in particular, document classification results fulfill the requirements of high-volume application. Integration into production lines is under execution.}, } @article{Apte94, author = {Apt\'{e}, Chidanand and Damerau, Fred J. and Weiss, Sholom M.}, title = {Automated learning of decision rules for text categorization}, journal = {ACM Transactions on Information Systems}, year = {1994}, number = {3}, volume = {12}, pages = {233--251}, url = {http://www.acm.org/pubs/articles/journals/tois/1994-12-3/p233-apte/p233-apte.pdf}, abstract = {We describe the results of extensive experiments using optimized rule-based induction methods on large document collections. The goal of these methods is to discover automatically classification patterns that can be used for general document categorization or personalized filtering of free text. Previous reports indicate that human-engineered rule-based systems, requiring many man-years of developmental efforts, have been successfully built to ``read'' documents and assign topics to them. We show that machine-generated decision rules appear comparable to human performance, while using the identical rule-based representation. In comparison with other machine-learning techniques, results on a key benchmark from the Reuters collection show a large gain in performance, from a previously reported 67\% recall/precision breakeven point to 80.5\%. In the context of a very high-dimensional feature space, several methodological alternatives are examined, including universal versus local dictionaries, and binary versus frequency related features.}, } @inProceedings{Apte94a, author = {Apt\'{e}, Chidanand and Damerau, Fred J. 
and Weiss, Sholom M.}, title = {Towards Language-Independent Automated Learning of Text Categorization Models}, booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on Research and Development in Information Retrieval}, editor = {W. Bruce Croft and Van Rijsbergen, Cornelis J.}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Dublin, IE}, pages = {23--30}, year = {1994}, note = {An extended version appears as~\cite{Apte94}}, url = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p23-apte/p23-apte.pdf}, abstract = {We describe the results of extensive machine learning experiments on large collections of Reuters' English and German newswires. The goal of these experiments was to automatically discover classification patterns that can be used for assignment of topics to the individual newswires. Our results with the English newswire collection show a very large gain in performance as compared to published benchmarks, while our initial results with the German newswires appear very promising. We present our methodology, which seems to be insensitive to the language of the document collections, and discuss issues related to the differences in results that we have obtained for the two collections.}, } @article{Attardi98, author = {Attardi, Giuseppe and Di Marco, Sergio and Salvi, Davide}, title = {Categorization by context}, journal = {Journal of Universal Computer Science}, year = {1998}, number = {9}, volume = {4}, pages = {719--736}, url = {http://www.jucs.org/jucs_4_9/categorisation_by_context}, abstract = {Assistance in retrieving of documents on the World Wide Web is provided either by search engines, through keyword based queries, or by catalogues, which organise documents into hierarchical collections. Maintaining catalogues manually is becoming increasingly difficult due to the sheer amount of material on the Web, and therefore it will be soon necessary to resort to techniques for automatic classification of documents. 
Classification is traditionally performed by extracting information for indexing a document from the document itself. The paper describes the technique of categorisation by context, which exploits the context perceivable from the structure of HTML documents to extract useful information for classifying the documents they refer to. We present the results of experiments with a preliminary implementation of the technique.}, } @inProceedings{Attardi99, author = {Giuseppe Attardi and Antonio Gull{\'{\i}} and Fabrizio Sebastiani}, title = {Automatic Web Page Categorization by Link and Context Analysis}, booktitle = {Proceedings of THAI-99, 1st European Symposium on Telematics, Hypermedia and Artificial Intelligence}, editor = {Chris Hutchison and Gaetano Lanzarone}, year = {1999}, address = {Varese, IT}, pages = {105--119}, url = {http://www.math.unipd.it/~fabseb60/Publications/THAI99.pdf}, abstract = {Assistance in retrieving documents on the World Wide Web is provided either by search engines, through keyword-based queries, or by catalogues, which organize documents into hierarchical collections. Maintaining catalogues manually is becoming increasingly difficult, due to the sheer amount of material on the Web; it is thus becoming necessary to resort to techniques for the automatic classification of documents. Automatic classification is traditionally performed by extracting the information for representing a document (``indexing'') from the document itself. The paper describes the novel technique of categorization by context, which instead extracts useful information for classifying a document from the context where a URL referring to it appears. 
We present the results of experimenting with Theseus, a classifier that exploits this technique.},
}

@inProceedings{Avancini03,
  author    = {Avancini, Henri and Lavelli, Alberto and Magnini, Bernardo and Sebastiani, Fabrizio and Zanoli, Roberto},
  title     = {Expanding Domain-Specific Lexicons by Term Categorization},
  booktitle = {Proceedings of SAC-03, 18th ACM Symposium on Applied Computing},
  pages     = {793--797},
  year      = {2003},
  publisher = {ACM Press, New York, US},
  address   = {Melbourne, US},
  url       = {http://www.math.unipd.it/~fabseb60/Publications/SAC03c.pdf},
  abstract  = {We discuss an approach to the automatic expansion of domain-specific lexicons by means of \emph{term categorization}, a novel task employing techniques from information retrieval (IR) and machine learning (ML). Specifically, we view the expansion of such lexicons as a process of learning previously unknown associations between terms and \emph{domains}. The process generates, for each $c_{i}$ in a set $C=\{c_{1},\ldots,c_{m}\}$ of domains, a lexicon $L^{i}_{1}$, bootstrapping from an initial lexicon $L^{i}_{0}$ and a set of documents $\theta$ given as input. The method is inspired by \emph{text categorization} (TC), the discipline concerned with labelling natural language texts with labels from a predefined set of domains, or categories. However, while TC deals with documents represented as vectors in a space of terms, we formulate the task of term categorization as one in which terms are (dually) represented as vectors in a space of documents, and in which terms (instead of documents) are labelled with domains.},
}

@inProceedings{Baker98,
  author    = {Baker, Douglas and McCallum, Andrew K.},
  title     = {Distributional clustering of words for text classification},
  booktitle = {Proceedings of SIGIR-98, 21st ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Croft, W. Bruce and Moffat, Alistair and Van Rijsbergen, Cornelis J.
and Wilkinson, Ross and Zobel, Justin},
  pages     = {96--103},
  year      = {1998},
  publisher = {ACM Press, New York, US},
  address   = {Melbourne, AU},
  url       = {http://www.cs.cmu.edu/~mccallum/papers/clustering-sigir98.ps.gz},
  abstract  = {We describe the application of distributional clustering to document classification. This approach clusters words into groups based on the distribution of class labels associated with each word. Thus, unlike some other unsupervised dimensionality-reduction techniques, such as latent semantic indexing, we are able to compress the feature space much more aggressively, while still maintaining high document classification accuracy. Experimental results obtained on three real-world data sets show that we can reduce the feature dimensionality by three orders of magnitude and lose only 2\% accuracy, significantly better than latent semantic indexing, class-based clustering, feature selection by mutual information, or Markov-blanket-based feature selection. We also show that less aggressive clustering sometimes results in improved classification accuracy over classification without clustering.},
}

@inProceedings{Bao01,
  author    = {Bao, Yongguang and Aoyama, Satoshi and Du, Xiaoyong and Yamada, Kazutaka and Ishii, Naohiro},
  title     = {A Rough Set-Based Hybrid Method to Text Categorization},
  booktitle = {Proceedings of WISE-01, 2nd International Conference on Web Information Systems Engineering},
  editor    = {{\"O}zsu, M. Tamer and Schek, Hans-J{\"{o}}rg and Tanaka, Katsumi and Zhang, Yanchun and Kambayashi, Yahiko},
  pages     = {254--261},
  year      = {2001},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  address   = {Kyoto, JP},
  abstract  = {In this paper we present a hybrid text categorization method based on Rough Sets theory. A central problem in good text Classification for information filtering and retrieval (IF/IR) is the high dimensionality of the data. It may contain many unnecessary and irrelevant features.
To cope with this problem, we propose a hybrid technique using Latent Semantic Indexing (LSI) and Rough Sets theory (RS) to alleviate this situation. Given corpora of documents and a training set of examples of classified documents, the technique locates a minimal set of co-ordinate keywords to distinguish between classes of documents, reducing the dimensionality of the keyword vectors. This simplifies the creation of knowledge-based IF/IR systems, speeds up their operation, and allows easy editing of the rule bases employed. Besides, we generate several knowledge base instead of one knowledge base for the classification of new object, hoping that the combination of answers of the multiple knowledge bases result in better performance. Multiple knowledge bases can be formulated precisely and in a unified way within the framework of RS. This paper describes the proposed technique, discusses the integration of a keyword acquisition algorithm, Latent Semantic Indexing (LSI) with Rough Set-based rule generate algorithm, and provides experimental results. The test results show the hybrid method is better than the previous rough set-based approach.},
}

@inProceedings{Basili00,
  author    = {Basili, Roberto and Moschitti, Alessandro and Pazienza, Maria T.},
  title     = {Language-Sensitive Text Classification},
  booktitle = {Proceedings of RIAO-00, 6th International Conference ``Recherche d'Information Assistee par Ordinateur''},
  pages     = {331--343},
  year      = {2000},
  address   = {Paris, FR},
  abstract  = {It is a traditional belief that in order to scale-up to more effective retrieval and access methods modern Information Retrieval has to consider more the text content. The modalities and techniques to fit this objectives are still under discussion. More empirical evidence is required to determine the suitable linguistic levels for modeling each IR subtask (e.g. information zoning, parsing, feature selection for indexing,...)
and the corresponding use of this information. In this paper an original classification model sensitive to document syntactic information and characterized by a novel inference method is described. Extensive experimental evidence has been derived on real test data and also from well-established academic test sets. The results show that a significant improvement can be derived using the proposed inference model. Also the role of linguistic preprocessing seems to provide positive effects on the performance. POS tagging and recognition of Proper Nouns received a specific experimental attention and provided significant effects on measured accuracy.},
}

@inProceedings{Basili01,
  author    = {Basili, Roberto and Moschitti, Alessandro and Pazienza, Maria T.},
  title     = {{NLP}-driven {IR}: Evaluating Performances over a Text Classification task},
  booktitle = {Proceedings of IJCAI-01, 17th International Joint Conference on Artificial Intelligence},
  editor    = {Nebel, Bernhard},
  pages     = {1286--1291},
  year      = {2001},
  address   = {Seattle, US},
  abstract  = {Although several attempts have been made to introduce Natural Language Processing (NLP) techniques in Information Retrieval, most ones failed to prove their effectiveness in increasing performances. In this paper Text Classification (TC) has been taken as the IR task and the effect of linguistic capabilities of the underlying system have been studied. A novel model for TC, extending a well know statistical model (i.e. Rocchio's formula [Ittner et al., 1995]) and applied to linguistic features has been defined and experimented. The proposed model represents an effective feature selection methodology. All the experiments result in a significant improvement with respect to other purely statistical methods (e.g. [Yang, 1999]), thus stressing the relevance of the available linguistic information. Moreover, the derived classifier reachs the performance (about 85\%) of the best known models (i.e.
Support Vector Machines (SVM) and k-Nearest Neighbour (KNN)) characterized by an higher computational complexity for training and processing.},
}

@inProceedings{Basili01a,
  author    = {Basili, Roberto and Moschitti, Alessandro and Pazienza, Maria T.},
  title     = {An hybrid approach to optimize feature selection process in text classification},
  booktitle = {Proceedings of AI*IA-01, 7th Congress of the Italian Association for Artificial Intelligence},
  editor    = {Esposito, Floriana},
  pages     = {320--325},
  year      = {2001},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Bari, IT},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2175},
  url       = {http://link.springer.de/link/service/series/0558/papers/2175/21750320.pdf},
  abstract  = {Feature selection and weighting are the primary activity of every learning algorithm for text classification. Traditionally these tasks are carried out individually in two distinct phases: the first is the global feature selection during a corpus pre-processing and the second is the application of the feature weighting model. This means that two (or several) different techniques are used to optimize the performances even if a single algorithm may have more chances to operate the right choices. When the complete feature set is available, the classifier learning algorithm can better relate to the suitable representation level the different complex features like linguistic ones (e.g. syntactic categories associated to words in the training material or terminological expressions). In [3] it has been suggested that classifiers based on generalized Rocchio formula can be used to weight features in category profiles in order to exploit the selectivity of linguistic information techniques in text classification.
In this paper, a systematic study aimed to understand the role of Rocchio formula in selection and weighting of linguistic features will be described.},
}

@inProceedings{Basili01b,
  author    = {Basili, Roberto and Moschitti, Alessandro},
  title     = {A robust model for intelligent text classification},
  booktitle = {Proceedings of ICTAI-01, 13th IEEE International Conference on Tools with Artificial Intelligence},
  pages     = {265--272},
  year      = {2001},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  address   = {Dallas, US},
  abstract  = {Methods for taking into account linguistic content into text retrieval are receiving a growing attention [16],[14]. Text categorization is an interesting area for evaluating and quantifying the impact of linguistic information. Works in text retrieval through Internet suggest that embedding linguistic information at a suitable level within traditional quantitative approaches (e.g. sense distinctions for query expansion as in [14]) is the crucial issue able to bring the experimental stage to operational results. This kind of representational problem is also studied in this paper where traditional methods for statistical text categorization are augmented via a systematic use of linguistic information. Again, as in [14], the addition of NLP capabilities also suggested a different application of existing methods in revised forms. This paper presents an extension of the Rocchio formula [11] as a feature weighting and selection model used as a basis for multilingual Information Extraction. It allows an effective exploitation of the available linguistic information that better emphasizes this latter with significant both data compression and accuracy. The results is an original statistical classifier fed with linguistic (i.e. more complex) features and characterized by the novel feature selection and weighting model. It outperforms existing systems by keeping most of their interesting properties (i.e.
easy implementation, low complexity and high scalability). Extensive tests of the model suggest its application as a viable and robust tool for large scale text classification and filtering, as well as a basic module for more complex scenarios.},
}

@article{Bayer98,
  author    = {Bayer, Thomas and Kressel, Ulrich and Mogg-Schneider, Heike and Renz, Ingrid},
  title     = {Categorizing paper documents. A generic system for domain and language independent text categorization},
  journal   = {Computer Vision and Image Understanding},
  volume    = {70},
  number    = {3},
  pages     = {299--306},
  year      = {1998},
  url       = {http://www.idealibrary.com/links/doi/10.1006/cviu.1998.0687/pdf},
  abstract  = {Text categorization assigns predefined categories to either electronically available texts or those resulting from document image analysis. A generic system for text categorization is presented which is based on statistical analysis of representative text corpora. Significant features are automatically derived from training texts by selecting substrings from actual word forms and applying statistical information and general linguistic knowledge. The dimension of the feature vectors is then reduced by linear transformation, keeping the essential information. The classification is a minimum least-squares approach based on polynomials. The described system can be efficiently adapted to new domains or different languages. In application, the adapted text categorizers are reliable, fast, and completely automatic. Two example categorization tasks achieve recognition scores of approximately 80\% and are very robust against recognition or typing errors.},
}

@inProceedings{Bekkerman01,
  author    = {Bekkerman, Ron and El-Yaniv, Ran and Tishby, Naftali and Winter, Yoad},
  title     = {On Feature Distributional Clustering for Text Categorization},
  booktitle = {Proceedings of SIGIR-01, 24th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Croft, W. Bruce and Harper, David J.
and Kraft, Donald H. and Zobel, Justin},
  pages     = {146--153},
  year      = {2001},
  publisher = {ACM Press, New York, US},
  address   = {New Orleans, US},
  url       = {http://www.cs.huji.ac.il/labs/learning/Papers/sigir.ps.gz},
  abstract  = {We describe a text categorization approach that is based on a combination of feature distributional clusters with a support vector machine (SVM) classifier. Our feature selection approach employs distributional clustering of words via the recently introduced information bottleneck method, which generates a more efficient word-cluster representation of documents. Combined with the classification power of an SVM, this method yields high performance text categorization that can outperform other recent methods in terms of categorization accuracy and representation efficiency. Comparing the accuracy of our method with other techniques, we observe significant dependency of the results on the data set. We discuss the potential reasons for this dependency.},
}

@article{Bekkerman03,
  author    = {Bekkerman, Ron and El-Yaniv, Ran and Tishby, Naftali and Winter, Yoad},
  title     = {Distributional word clusters vs.\ words for text categorization},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {1183--1208},
  year      = {2003},
  url       = {http://www.jmlr.org/papers/volume3/bekkerman03a/bekkerman03a.pdf},
  abstract  = {We study an approach to text categorization that combines distributional clustering of words and a Support Vector Machine (SVM) classifier. This word-cluster representation is computed using the recently introduced Information Bottleneck method, which generates a compact and efficient representation of documents. When combined with the classification power of the SVM, this method yields high performance in text categorization. This novel combination of SVM with word-cluster representation is compared with SVM-based categorization using the simpler bag-of-words (BOW) representation. The comparison is performed over three known datasets.
On one of these datasets (the 20 Newsgroups) the method based on word clusters significantly outperforms the word-based representation in terms of categorization accuracy or representation efficiency. On the two other sets (Reuters-21578 and WebKB) the word-based representation slightly outperforms the word-cluster representation. We investigate the potential reasons for this behavior and relate it to structural differences between the datasets.},
}

@inProceedings{Bel03,
  author    = {Bel, Nuria and Koster, Cornelis H. and Villegas, Marta},
  title     = {Cross-lingual text categorization},
  booktitle = {Proceedings of ECDL-03, 7th European Conference on Research and Advanced Technology for Digital Libraries},
  editor    = {Koch, Traugott and Torvik S{\o}lvberg, Ingeborg},
  pages     = {126--139},
  year      = {2003},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Trondheim, NO},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2769},
}

@article{Benkhalifa01,
  author    = {Benkhalifa, Mohammed and Mouradi, Abdelhak and Bouyakhf, Houssaine},
  title     = {Integrating External Knowledge to Supplement Training Data in Semi-Supervised Learning for Text Categorization},
  journal   = {Information Retrieval},
  volume    = {4},
  number    = {2},
  pages     = {91--113},
  year      = {2001},
  url       = {http://www.wkap.nl/article.pdf?351286},
  abstract  = {Text Categorization (TC) is the automated assignment of text documents to predefined categories based on document contents. TC has been an application for many learning approaches, which prove effective. Nevertheless, TC provides many challenges to machine learning. In this paper, we suggest, for text categorization, the integration of external WordNet lexical information to supplement training data for a semi-supervised clustering algorithm which can learn from both training and test documents to classify new unseen documents. This algorithm is the ``Semi-Supervised Fuzzy c-Means'' (ssFCM).
Our experiments use Reuters 21578 database and consist of binary classifications for categories selected from the 115 TOPICS classes of the Reuters collection. Using the Vector Space Model, each document is represented by its original feature vector augmented with external feature vector generated using WordNet. We verify experimentally that the integration of WordNet helps ssFCM improve its performance, effectively addresses the classification of documents into categories with few training documents and does not interfere with the use of training data.},
}

@article{Benkhalifa01a,
  author    = {Benkhalifa, Mohammed and Mouradi, Abdelhak and Bouyakhf, Houssaine},
  title     = {Integrating {WordNet} knowledge to supplement training data in semi-supervised agglomerative hierarchical clustering for text categorization},
  journal   = {International Journal of Intelligent Systems},
  volume    = {16},
  number    = {8},
  pages     = {929--947},
  year      = {2001},
  url       = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=84503376&PLACEBO=IE.pdf},
  abstract  = {The text categorization (TC) is the automated assignment of text documents to predefined categories based on document contents. TC has been an application for many learning approaches, which proved effective. Nevertheless, TC provides many challenges to machine learning. In this paper, we suggest, for text categorization, the integration of external WordNet lexical information to supplement training data for a semi-supervised clustering algorithm which (i) uses a finite design set of labeled data to (ii) help agglomerative hierarchical clustering algorithms (AHC) partition a finite set of unlabeled data and then (iii) terminates without the capacity to classify other objects. This algorithm is the ``semi-supervised agglomerative hierarchical clustering algorithm'' (ssAHC). Our experiments use Reuters 21578 database and consist of binary classifications for categories selected from the 89 TOPICS classes of the Reuters collection.
Using the vector space model (VSM), each document is represented by its original feature vector augmented with external feature vector generated using WordNet. We verify experimentally that the integration of WordNet helps ssAHC improve its performance, effectively addresses the classification of documents into categories with few training documents, and does not interfere with the use of training data.},
}

@inProceedings{Benkhalifa99,
  author    = {Benkhalifa, Mohammed and Bensaid, Amine and Mouradi, Abdelhak},
  title     = {Text categorization using the semi-supervised fuzzy c-means algorithm},
  booktitle = {Proceedings of NAFIPS-99, 18th International Conference of the North American Fuzzy Information Processing Society},
  pages     = {561--565},
  year      = {1999},
  address   = {New York, US},
  abstract  = {Text categorization (TC) is the automated assignment of text documents to predefined categories based on document contents. TC has become very important in the information retrieval area, where information needs have tremendously increased with the rapid growth of textual information sources such as the Internet. We compare, for text categorization, two partially supervised (or semi-supervised) clustering algorithms: the Semi-Supervised Agglomerative Hierarchical Clustering (ssAHC) algorithm (A. Amar et al., 1997) and the Semi-Supervised Fuzzy-c-Means (ssFCM) algorithm (M. Amine et al., 1996). This (semi-supervised) learning paradigm falls somewhere between the fully supervised and the fully unsupervised learning schemes, in the sense that it exploits both class information contained in labeled data (training documents) and structure information possessed by unlabeled data (test documents) in order to produce better partitions for test documents. Our experiments make use of the Reuters 21578 database of documents and consist of a binary classification for each of the ten most populous categories of the Reuters database.
To convert the documents into vector form, we experiment with different numbers of features, which we select, based on an information gain criterion. We verify experimentally that ssFCM both outperforms and takes less time than the Fuzzy-c-Means (FCM) algorithm. With a smaller number of features, ssFCM's performance is also superior to that of ssAHC's. Finally ssFCM results in improved performance and faster execution time as more weight is given to training documents.},
}

@inProceedings{Bennett02,
  author    = {Bennett, Paul N. and Dumais, Susan T. and Horvitz, Eric},
  title     = {Probabilistic combination of text classifiers using reliability indicators: models and results},
  booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Beaulieu, Micheline and Baeza-Yates, Ricardo and Myaeng, Sung Hyon and J{\"{a}}rvelin, Kalervo},
  pages     = {207--214},
  year      = {2002},
  publisher = {ACM Press, New York, US},
  address   = {Tampere, FI},
  url       = {http://doi.acm.org/10.1145/564376.564413},
  abstract  = {The intuition that different text classifiers behave in qualitatively different ways has long motivated attempts to build a better metaclassifier via some combination of classifiers. We introduce a probabilistic method for combining classifiers that considers the context-sensitive reliabilities of contributing classifiers. The method harnesses reliability indicators---variables that provide a valuable signal about the performance of classifiers in different situations. We provide background, present procedures for building metaclassifiers that take into consideration both reliability indicators and classifier outputs, and review a set of comparative studies undertaken to evaluate the methodology.},
}

@inProceedings{Bennett03,
  author    = {Paul N.
Bennett},
  title     = {Using asymmetric distributions to improve text classifier probability estimates},
  booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Callan, Jamie and Cormack, Gordon and Clarke, Charles and Hawking, David and Smeaton, Alan},
  pages     = {111--118},
  year      = {2003},
  publisher = {ACM Press, New York, US},
  address   = {Toronto, CA},
  url       = {http://doi.acm.org/10.1145/860435.860457},
  abstract  = {Text classifiers that give probability estimates are more readily applicable in a variety of scenarios. For example, rather than choosing one set decision threshold, they can be used in a Bayesian risk model to issue a run-time decision which minimizes a user-specified cost function dynamically chosen at prediction time. However, the quality of the probability estimates is crucial. We review a variety of standard approaches to converting scores (and poor probability estimates) from text classifiers to high quality estimates and introduce new models motivated by the intuition that the empirical score distribution for the ``extremely irrelevant'', ``hard to discriminate'', and ``obviously relevant'' items are often significantly different. Finally, we analyze the experimental performance of these models over the outputs of two text classifiers. The analysis demonstrates that one of these models is theoretically attractive (introducing few new parameters while increasing flexibility), computationally efficient, and empirically preferable.},
}

@article{Bennett05,
  author    = {Bennett, Paul N. and Susan T.
Dumais and Horvitz, Eric},
  title     = {The Combination of Text Classifiers Using Reliability Indicators},
  journal   = {Information Retrieval},
  volume    = {8},
  number    = {1},
  pages     = {67--100},
  year      = {2005},
  url       = {http://www.kluweronline.com/issn/1386-4564},
  abstract  = {The intuition that different text classifiers behave in qualitatively different ways has long motivated attempts to build a better metaclassifier via some combination of classifiers. We introduce a probabilistic method for combining classifiers that considers the context-sensitive reliabilities of contributing classifiers. The method harnesses reliability indicators---variables that provide signals about the performance of classifiers in different situations. We provide background, present procedures for building metaclassifiers that take into consideration both reliability indicators and classifier outputs, and review a set of comparative studies undertaken to evaluate the methodology.},
}

@inProceedings{Bickel04,
  author    = {Bickel, Steffen and Scheffer, Tobias},
  title     = {Learning from message pairs for automatic email answering},
  booktitle = {Proceedings of ECML-04, 15th European Conference on Machine Learning},
  editor    = {Boulicaut, Jean-Fran{\c{c}}ois and Esposito, Floriana and Giannotti, Fosca and Pedreschi, Dino},
  pages     = {87--98},
  year      = {2004},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Pisa, IT},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 3201},
}

@inProceedings{Biebricher88,
  author    = {Biebricher, Peter and Fuhr, Norbert and Knorz, Gerhard and Lustig, Gerhard and Schwantner, Michael},
  title     = {The automatic indexing system {AIR/PHYS}.
From research to application},
  booktitle = {Proceedings of SIGIR-88, 11th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Chiaramella, Yves},
  pages     = {333--342},
  year      = {1988},
  publisher = {ACM Press, New York, US},
  address   = {Grenoble, FR},
  note      = {Reprinted in Karen Sparck Jones and Peter Willett (eds.), ``Readings in Information Retrieval'', Morgan Kaufmann, San Francisco, US, 1997, pp.\ 513--517.},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/62437/p333-biebricher/p333-biebricher.pdf},
  abstract  = {Since October 1985, the automatic indexing system AIR/PHYS has been used in the input production of the physics data base of the Fachinformationszentrum Karlsruhe/West Germany. The texts to be indexed are abstracts written in English. The system of descriptors is prescribed. For the application of the AIR/PHYS system a large-scale dictionary containing more than 600000 word-descriptor relations resp. phrase-descriptor relations has been developed. Most of these relations have been obtained by means of statistical and heuristical methods. In consequence, the relation system is rather imperfect. Therefore, the indexing system needs some fault-tolerating features. An appropriate indexing approach and the corresponding structure of the AIR/PHYS system are described.
Finally, the conditions of the application as well as problems of further development are discussed.},
}

@inProceedings{Bigi03,
  author    = {Bigi, Brigitte},
  title     = {Using {Kullback-Leibler} distance for text categorization},
  booktitle = {Proceedings of ECIR-03, 25th European Conference on Information Retrieval},
  editor    = {Sebastiani, Fabrizio},
  pages     = {305--319},
  year      = {2003},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Pisa, IT},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2633},
  url       = {http://link.springer.de/link/service/series/0558/papers/2633/26330305.pdf},
  abstract  = {A system that performs text categorization aims to assign appropriate categories from a predefined classification scheme to incoming documents. These assignments might be used for varied purposes such as filtering, or retrieval. This paper introduces a new effective model for text categorization with great corpus (more or less 1 million documents). Text categorization is performed using the Kullback-Leibler distance between the probability distribution of the document to classify and the probability distribution of each category. Using the same representation of categories, experiments show a significant improvement when the above mentioned method is used. KLD method achieve substantial improvements over the tfidf performing method.},
}

@article{Blei03,
  author    = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.},
  title     = {Latent {Dirichlet} Allocation},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {993--1022},
  year      = {2003},
  url       = {http://www.ai.mit.edu/projects/jmlr/papers/volume3/blei03a/blei03a.pdf},
  abstract  = {We describe latent Dirichlet allocation (LDA), a generative probabilistic model for collections of discrete data such as text corpora. LDA is a three-level hierarchical Bayesian model, in which each item of a collection is modeled as a finite mixture over an underlying set of topics. Each topic is, in turn, modeled as an infinite mixture over an underlying set of topic probabilities.
In the context of text modeling, the topic probabilities provide an explicit representation of a document. We present efficient approximate inference techniques based on variational methods and an EM algorithm for empirical Bayes parameter estimation. We report results in document modeling, text classification, and collaborative filtering, comparing to a mixture of unigrams model and the probabilistic LSI model.},
}

@article{Bloedorn98,
  author    = {Bloedorn, Eric and Michalski, Ryszard S.},
  title     = {Data-Driven Constructive Induction},
  journal   = {IEEE Intelligent Systems},
  volume    = {13},
  number    = {2},
  pages     = {30--37},
  year      = {1998},
  abstract  = {An inductive learning program's ability to find an accurate hypothesis can depend on the quality of the representation space. The authors developed a data-driven constructive-induction method that uses multiple operators to improve the representation space. They applied it to two real-world problems.},
}

@inProceedings{Blosseville92,
  author    = {Blosseville, M. J. and Hebrail, Georges and Montell, M. G. and Penot, N.},
  title     = {Automatic document classification: natural language processing and expert system techniques used together},
  booktitle = {Proceedings of SIGIR-92, 15th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Belkin, Nicholas J. and Ingwersen, Peter and Pejtersen, Annelise Mark},
  pages     = {51--57},
  year      = {1992},
  publisher = {ACM Press, New York, US},
  address   = {K{\o}benhavn, DK},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/133160/p51-blosseville/p51-blosseville.pdf},
  abstract  = {In this paper we describe an automated method of classifying research project descriptions: a human expert classifies a sample set of projects into a set of disjoint and pre-defined classes, and then the computer learns from this sample how to classify new projects into these classes.
Both textual and non-textual information associated with the projects are used in the learning and classification phases. Textual information is processed by two methods of analysis: a natural language analysis followed by a statistical analysis. Non-textual information is processed by a symbolic learning technique. We present the results of some experiments done on real data: two different classifications of our research projects.},
}

@article{Borko63,
  author    = {Borko, Harold and Bernick, Myrna},
  title     = {Automatic document classification},
  journal   = {Journal of the Association for Computing Machinery},
  volume    = {10},
  number    = {2},
  pages     = {151--161},
  year      = {1963},
  url       = {http://www.acm.org/pubs/articles/journals/jacm/1963-10-2/p151-borko/p151-borko.pdf},
}

@article{Borko64,
  author    = {Borko, Harold and Bernick, Myrna},
  title     = {Automatic document classification. Part II: additional experiments},
  journal   = {Journal of the Association for Computing Machinery},
  volume    = {11},
  number    = {2},
  pages     = {138--151},
  year      = {1964},
  url       = {http://www.acm.org/pubs/articles/journals/jacm/1964-11-2/p138-borko/p138-borko.pdf},
  abstract  = {This study reports the results of a series of experiments in the techniques of automatic document classifications. Two different classification schedules are compared along with two methods of automatically classifying documents into categories. It is concluded that, while there is no significant difference in the predictive efficiency between the Bayesian and the Factor Score methods, automatic document classification is enhanced by the use of a factor-analytically-derived classification schedule.
Approximately 55 percent of the documents were automatically and correctly classified.},
}

@inProceedings{Brank02a,
  author    = {Janez Brank and Marko Grobelnik and Natasa Mili{\'{c}}-Frayling and Dunja Mladeni{\'{c}}},
  title     = {Feature selection using support vector machines},
  booktitle = {Proceedings of the 3rd International Conference on Data Mining Methods and Databases for Engineering, Finance, and Other Fields},
  year      = {2002},
  pages     = {},
  address   = {Bologna, IT},
  url       = {http://www.brank.org/msr/FsNormal/Bologna/bologna-paper-4.pdf},
  abstract  = {Text categorization is the task of classifying natural language documents into a set of predefined categories. Documents are typically represented by sparse vectors under the vector space model, where each word in the vocabulary is mapped to one coordinate axis and its occurrence in the document gives rise to one nonzero component in the vector representing that document. When training classifiers on large collections of documents, both the time and memory requirements connected with processing of these vectors may be prohibitive. This calls for using a feature selection method, not only to reduce the number of features but also to increase the sparsity of document vectors. We propose a feature selection method based on linear Support Vector Machines (SVMs). First, we train the linear SVM on a subset of training data and retain only those features that correspond to highly weighted components (in absolute value sense) of the normal to the resulting hyperplane that separates positive and negative examples. This reduced feature space is then used to train a classifier over a larger training set because more documents now fit into the same amount of memory.
In our experiments we compare the effectiveness of the SVM-based feature selection with that of more traditional feature selection methods, such as odds ratio and information gain, in achieving the desired tradeoff between the vector sparsity and the classification performance. Experimental results indicate that, at the same level of vector sparsity, feature selection based on SVM normals yields better classification performance than odds ratio- or information gain-based feature selection when linear SVM classifiers are used.},
}

@inProceedings{Bruckner97,
  author    = {T. Bruckner},
  title     = {The text categorization system {TEKLIS} at {TREC-6}},
  booktitle = {Proceedings of TREC-6, 6th Text Retrieval Conference},
  publisher = {National Institute of Standards and Technology, Gaithersburg, US},
  editor    = {Ellen M. Voorhees and Donna K. Harman},
  year      = {1997},
  address   = {Gaithersburg, US},
  pages     = {619--621},
  url       = {http://trec.nist.gov/pubs/trec6/papers/siemens.ps.gz},
  abstract  = {The article documents the author's participation in the filtering and routing tasks of TREC-6 with the commercial filtering system TEKLIS. TEKLIS is a training based statistical categorization system which incorporates shallow linguistic processing and fuzzy set methods.
The author presents the core technology of TEKLIS, the results on the filtering and routing tasks and a discussion of the insights gained through participation in the exercise.},
}

@inProceedings{Cai03,
  author    = {Lijuan Cai and Thomas Hofmann},
  title     = {Text categorization by boosting automatically extracted concepts},
  booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Jamie Callan and Gordon Cormack and Charles Clarke and David Hawking and Alan Smeaton},
  publisher = {ACM Press, New York, US},
  address   = {Toronto, CA},
  year      = {2003},
  pages     = {182--189},
  url       = {http://doi.acm.org/10.1145/860435.860470},
  doi       = {10.1145/860435.860470},
  abstract  = {Term-based representations of documents have found wide-spread use in information retrieval. However, one of the main shortcomings of such methods is that they largely disregard lexical semantics and, as a consequence, are not sufficiently robust with respect to variations in word usage. In this paper we investigate the use of concept-based document representations to supplement word- or phrase-based features. The utilized concepts are automatically extracted from documents via probabilistic latent semantic analysis. We propose to use AdaBoost to optimally combine weak hypotheses based on both types of features. Experimental results on standard benchmarks confirm the validity of our approach, showing that AdaBoost achieves consistent improvements by including additional semantic features in the learned ensemble.},
}

@inProceedings{Cai04,
  author    = {Lijuan Cai and Thomas Hofmann},
  title     = {Hierarchical Document Categorization with Support Vector Machines},
  booktitle = {Proceedings of CIKM-04, 13th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  address   = {Washington, US},
  editor    = {David A.
Evans and Luis Gravano and Otthein Herzog and ChengXiang Zhai and Marc Ronthaler},
  year      = {2004},
  pages     = {78--87},
  url       = {},
  abstract  = {},
}

@inProceedings{Calado03,
  author    = {P{\'{a}}vel Calado and Marco Cristo and Edleno Silva De Moura and Nivio Ziviani and Berthier A. Ribeiro-Neto and Marcos Andr{\'{e}} Gon{\c{c}}alves},
  title     = {Combining link-based and content-based methods for Web document classification},
  booktitle = {Proceedings of CIKM-03, 12th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor    = {},
  year      = {2003},
  address   = {New Orleans, US},
  pages     = {394--401},
  url       = {http://doi.acm.org/10.1145/956863.956938},
  doi       = {10.1145/956863.956938},
  abstract  = {This paper studies how link information can be used to improve classification results for Web collections. We evaluate four different measures of subject similarity, derived from the Web link structure, and determine how accurate they are in predicting document categories. Using a Bayesian network model, we combine these measures with the results obtained by traditional content-based classifiers. Experiments on a Web directory show that best results are achieved when links from pages outside the directory are considered. Link information alone is able to obtain gains of up to 46 points in F1, when compared to a traditional content-based classifier. The combination with content-based methods can further improve the results, but too much noise may be introduced, since the text of Web pages is a much less reliable source of information.
This work provides an important insight on which measures derived from links are more appropriate to compare Web documents and how these measures can be combined with content-based algorithms to improve the effectiveness of Web classification.},
}

@inProceedings{Caldon03,
  author    = {Patrick Caldon},
  title     = {Using Text Classification to Predict the Gene Knockout Behaviour of {S.\ Cerevisiae}},
  booktitle = {Proceedings of APBC-03, 1st Asia-Pacific Bioinformatics Conference},
  editor    = {Yi-Ping P. Chen},
  publisher = {Australian Computer Society},
  address   = {Adelaide, AU},
  year      = {2003},
  pages     = {211--214},
  url       = {},
  abstract  = {},
}

@article{Carbonell00,
  author    = {Jaime Carbonell and William W. Cohen and Yiming Yang},
  title     = {Guest editors' introduction to the special issue on machine learning and information retrieval},
  journal   = {Machine Learning},
  volume    = {39},
  number    = {2/3},
  pages     = {99--101},
  year      = {2000},
  url       = {http://www.wkap.nl/article.pdf?255754},
}

@inProceedings{Cardoso03,
  author    = {Ana Cardoso-Cachopo and Arlindo L. Oliveira},
  title     = {An Empirical Comparison of Text Categorization Methods},
  booktitle = {Proceedings of SPIRE-03, 10th International Symposium on String Processing and Information Retrieval},
  editor    = {Mario A. Nascimento and Edleno S. De Moura and Arlindo L. Oliveira},
  publisher = {Springer Verlag, Heidelberg, DE},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2857},
  year      = {2003},
  address   = {Manaus, BR},
  pages     = {183--196},
  url       = {http://www.gia.ist.utl.pt/~acardoso/spire03.pdf},
  abstract  = {In this paper we present a comprehensive comparison of the performance of a number of text categorization methods in two different data sets. In particular, we evaluate the Vector and Latent Semantic Analysis (LSA) methods, a classifier based on Support Vector Machines (SVM) and the k-Nearest Neighbor variations of the Vector and LSA models.
We report the results obtained using the Mean Reciprocal Rank as a measure of overall performance, a commonly used evaluation measure for question answering tasks. We argue that this evaluation measure is also very well suited for text categorization tasks. Our results show that overall, SVMs and k-NN LSA perform better than the other methods, in a statistically significant way.},
}

@inCollection{Caropreso01,
  author    = {Maria Fernanda Caropreso and Stan Matwin and Fabrizio Sebastiani},
  title     = {A learner-independent evaluation of the usefulness of statistical phrases for automated text categorization},
  year      = {2001},
  booktitle = {Text Databases and Document Management: Theory and Practice},
  editor    = {Amita G. Chin},
  publisher = {Idea Group Publishing},
  address   = {Hershey, US},
  pages     = {78--102},
  url       = {http://www.math.unipd.it/~fabseb60/Publications/TD01a.pdf},
  abstract  = {In this work we investigate the usefulness of {\em $n$-grams} for document indexing in text categorization (TC). We call $n$-gram a set $g_k$ of $n$ word stems, and we say that $g_k$ occurs in a document $d_j$ when a sequence of words appears in $d_j$ that, after stop word removal and stemming, consists exactly of the $n$ stems in $g_k$, in some order. Previous researches have investigated the use of $n$-grams (or some variant of them) in the context of specific learning algorithms, and thus have not obtained general answers on their usefulness for TC. In this work we investigate the usefulness of $n$-grams in TC independently of any specific learning algorithm. We do so by applying feature selection to the pool of all $k$-grams ($k\leq n$), and checking how many $n$-grams score high enough to be selected in the top $\sigma$ $k$-grams. We report the results of our experiments, using various feature selection measures and varying values of $\sigma$, performed on the {\sc Reuters-21578} standard TC benchmark.
We also report results of making actual use of the selected $n$-grams in the context of a linear classifier induced by means of the Rocchio method.},
}

@inProceedings{Carreras01,
  author    = {Xavier Carreras and Llu{\'{\i}}s M{\'{a}}rquez},
  title     = {Boosting Trees for Anti-Spam Email Filtering},
  year      = {2001},
  editor    = {},
  booktitle = {Proceedings of RANLP-01, 4th International Conference on Recent Advances in Natural Language Processing},
  address   = {Tzigov Chark, BG},
  pages     = {},
  url       = {http://www.lsi.upc.es/~carreras/pub/boospam.ps},
}

@inProceedings{Cavnar94,
  author    = {William B. Cavnar and John M. Trenkle},
  title     = {N-Gram-Based Text Categorization},
  booktitle = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document Analysis and Information Retrieval},
  publisher = {},
  editor    = {},
  year      = {1994},
  address   = {Las Vegas, US},
  pages     = {161--175},
  url       = {http://www.nonlineardynamics.com/trenkle/papers/sdair-94-bc.ps.gz},
  abstract  = {Text categorization is a fundamental task in document processing, allowing the automated handling of enormous streams of documents in electronic form. One difficulty in handling some classes of documents is the presence of different kinds of textual errors, such as spelling and grammatical errors in email, and character recognition errors in documents that come through OCR. Text categorization must work reliably on all input, and thus must tolerate some level of these kinds of problems. We describe here an N-gram-based approach to text categorization that is tolerant of textual errors. The system is small, fast and robust. This system worked very well for language classification, achieving in one test a 99.8\% correct classification rate on Usenet newsgroup articles written in different languages. The system also worked reasonably well for classifying articles from a number of different computer-oriented newsgroups according to subject, achieving as high as an 80\% correct classification rate.
There are also several obvious directions for improving the system's classification performance in those cases where it did not do as well. The system is based on calculating and comparing profiles of N-gram frequencies. First, we use the system to compute profiles on training set data that represent the various categories, e.g., language samples or newsgroup content samples. Then the system computes a profile for a particular document that is to be classified. Finally, the system computes a distance measure between the document's profile and each of the category profiles. The system selects the category whose profile has the smallest distance to the document's profile. The profiles involved are quite small, typically 10K bytes for a category training set, and less than 4K bytes for an individual document. Using N-gram frequency profiles provides a simple and reliable way to categorize documents in a wide range of classification tasks.},
}

@inProceedings{Ceci03,
  author    = {Michelangelo Ceci and Donato Malerba},
  title     = {Hierarchical Classification of {HTML} Documents with {WebClassII}},
  booktitle = {Proceedings of ECIR-03, 25th European Conference on Information Retrieval},
  publisher = {Springer Verlag},
  editor    = {Fabrizio Sebastiani},
  address   = {Pisa, IT},
  year      = {2003},
  pages     = {57--72},
  url       = {http://link.springer.de/link/service/series/0558/papers/2633/26330057.pdf},
  abstract  = {This paper describes a new method for the classification of a HTML document into a hierarchy of categories. The hierarchy of categories is involved in all phases of automated document classification, namely feature extraction, learning, and classification of a new document. The innovative aspects of this work are the feature selection process, the automated threshold determination for classification scores, and an experimental study on real-word Web documents that can be associated to any node in the hierarchy.
Moreover, a new measure for the evaluation of system performances has been introduced in order to compare three different techniques (flat, hierarchical with proper training sets, hierarchical with hierarchical training sets). The method has been implemented in the context of a client-server application, named WebClassII. Results show that for hierarchical techniques it is better to use hierarchical training sets.},
}

@inProceedings{Cerny83,
  author    = {Barbara A. Cerny and Anna Okseniuk and J. Dennis Lawrence},
  title     = {A fuzzy measure of agreement between machine and manual assignment of documents to subject categories},
  booktitle = {Proceedings of ASIS-83, 46th Annual Meeting of the American Society for Information Science},
  publisher = {American Society for Information Science, Washington, US},
  editor    = {Raymond F. Vondran and Anne Caputo and Carol Wasserman and Richard A. Diener},
  year      = {1983},
  address   = {Washington, US},
  pages     = {265},
  url       = {},
}

@inProceedings{Chai02,
  author    = {Kian M. Chai and Hwee T. Ng and Hai L. Chieu},
  title     = {Bayesian online classifiers for text classification and filtering},
  booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng and Kalervo J{\"{a}}rvelin},
  publisher = {ACM Press, New York, US},
  address   = {Tampere, FI},
  year      = {2002},
  pages     = {97--104},
  url       = {http://doi.acm.org/10.1145/564376.564395},
  doi       = {10.1145/564376.564395},
  abstract  = {This paper explores the use of Bayesian online classifiers to classify text documents. Empirical results indicate that these classifiers are comparable with the best text classification systems.
Furthermore, the online approach offers the advantage of continuous learning in the batch-adaptive text filtering task.},
}

@inProceedings{Chakrabarti02,
  author    = {Soumen Chakrabarti and Shourya Roy and Mahesh Soundalgekar},
  title     = {Fast and accurate text classification via multiple linear discriminant projections},
  booktitle = {Proceedings of VLDB-02, 28th International Conference on Very Large Data Bases},
  publisher = {},
  editor    = {},
  year      = {2002},
  address   = {Hong Kong, CN},
  pages     = {658--669},
  url       = {http://www.vldb.org/conf/2002/S19P01.pdf},
  abstract  = {Support vector machines (SVMs) have shown superb performance for text classification tasks. They are accurate, robust, and quick to apply to test instances. Their only potential drawback is their training time and memory requirement. For $n$ training instances held in memory, the best-known SVM implementations take time proportional to $n^a$, where $a$ is typically between 1.8 and 2.1. SVMs have been trained on data sets with several thousand instances, but Web directories today contain millions of instances which are valuable for mapping billions of Web pages into Yahoo!-like directories. We present SIMPL, a nearly linear-time classification algorithm which mimics the strengths of SVMs while avoiding the training bottleneck. It uses Fisher's linear discriminant, a classical tool from statistical pattern recognition, to project training instances to a carefully selected low-dimensional subspace before inducing a decision tree on the projected instances. SIMPL uses efficient sequential scans and sorts, and is comparable in speed and memory scalability to widely-used naive Bayes (NB) classifiers, but it beats NB accuracy decisively. It not only approaches and sometimes exceeds SVM accuracy, but also beats SVM running time by orders of magnitude.
While developing SIMPL, we also make a detailed experimental analysis of the cache performance of SVMs.},
}

@inProceedings{Chakrabarti97,
  author    = {Soumen Chakrabarti and Byron E. Dom and Rakesh Agrawal and Prabhakar Raghavan},
  title     = {Using taxonomy, discriminants, and signatures for navigating in text databases},
  booktitle = {Proceedings of VLDB-97, 23rd International Conference on Very Large Data Bases},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  editor    = {Matthias Jarke and Michael J. Carey and Klaus R. Dittrich and Frederick H. Lochovsky and Pericles Loucopoulos and Manfred A. Jeusfeld},
  year      = {1997},
  address   = {Athens, GR},
  pages     = {446--455},
  url       = {http://www.vldb.org/conf/1997/P446.PDF},
  note      = {An extended version appears as~\cite{Chakrabarti98c}},
  abstract  = {We explore how to organize a text database hierarchically to aid better searching and browsing. We propose to exploit the natural hierarchy of topics, or taxonomy, that many corpora, such as internet directories, digital libraries, and patent databases enjoy. In our system, the user navigates through the query response not as a flat unstructured list, but embedded in the familiar taxonomy, and annotated with document signatures computed dynamically with respect to where the user is located at any time. We show how to update such databases with new documents with high speed and accuracy. We use techniques from statistical pattern recognition to efficiently separate the feature words or discriminants from the noise words at each node of the taxonomy. Using these, we build a multi-level classifier. At each node, this classifier can ignore the large number of noise words in a document. Thus the classifier has a small model size and is very fast. However, owing to the use of context-sensitive features, the classifier is very accurate.
We report on experiences with the Reuters newswire benchmark, the US Patent database, and web document samples from {{\sc Yahoo!}}.},
}

@inProceedings{Chakrabarti98b,
  author    = {Soumen Chakrabarti and Byron E. Dom and Piotr Indyk},
  title     = {Enhanced hypertext categorization using hyperlinks},
  booktitle = {Proceedings of SIGMOD-98, ACM International Conference on Management of Data},
  editor    = {Laura M. Haas and Ashutosh Tiwary},
  publisher = {ACM Press, New York, US},
  address   = {Seattle, US},
  year      = {1998},
  pages     = {307--318},
  url       = {http://www.acm.org/pubs/articles/proceedings/mod/276304/p307-chakrabarti/p307-chakrabarti.pdf},
  abstract  = {A major challenge in indexing unstructured hypertext databases is to automatically extract meta-data that enables structured searching using topic taxonomies, circumvents keyword ambiguity and improves the quality of searching and profile-based routing and filtering. Therefore, an accurate classifier is an essential component of a hypertext database. Hyperlinks pose new problems not addressed in the extensive text classification literature. Links clearly contain high-quality semantic clues that are lost upon a purely term-based classifier, but exploiting link information is non-trivial because it is noisy. Naive use of terms in the link neighborhood of a document can even degrade accuracy. Our contribution is to propose robust statistical models and a relaxation labeling technique for better classification by exploiting link information in a small neighborhood around documents. Our technique also adapts gracefully to the fraction of neighboring documents having known topics. We experimented with pre-classified samples from {{\sc Yahoo!}}\ and the US Patent Database. We have developed a text classifier that misclassifies only 13\% of the documents in the Reuters benchmark; this is comparable to the best results ever obtained.
Our new classifier misclassified 36\% of the patents, indicating that classifying hypertext can be more difficult than classifying text. Naively using terms in neighboring documents increased the error to 38\%; our hypertext classifier reduced it to 21\%. Results with the Yahoo! sample were more dramatic: the text classifier showed a 68\% error, whereas our hypertext classifier reduced this to just 21\%.},
}

@article{Chakrabarti98c,
  author    = {Soumen Chakrabarti and Byron E. Dom and Rakesh Agrawal and Prabhakar Raghavan},
  title     = {Scalable feature selection, classification and signature generation for organizing large text databases into hierarchical topic taxonomies},
  journal   = {Journal of Very Large Data Bases},
  year      = {1998},
  number    = {3},
  volume    = {7},
  pages     = {163--178},
  url       = {http://www.cs.berkeley.edu/~soumen/VLDB54_3.PDF},
  abstract  = {We explore how to organize large text databases hierarchically by topic to aid better searching, browsing and filtering. Many corpora, such as internet directories, digital libraries, and patent databases are manually organized into topic hierarchies, also called taxonomies. Similar to indices for relational data, taxonomies make search and access more efficient. However, the exponential growth in the volume of on-line textual information makes it nearly impossible to maintain such taxonomic organization for large, fast-changing corpora by hand. We describe an automatic system that starts with a small sample of the corpus in which topics have been assigned by hand, and then updates the database with new documents as the corpus grows, assigning topics to these new documents with high speed and accuracy. To do this, we use techniques from statistical pattern recognition to efficiently separate the feature words, or discriminants, from the noise words at each node of the taxonomy. Using these, we build a multilevel classifier. At each node, this classifier can ignore the large number of ``noise'' words in a document.
Thus, the classifier has a small model size and is very fast. Owing to the use of context-sensitive features, the classifier is very accurate. As a by-product, we can compute for each document a set of terms that occur significantly more often in it than in the classes to which it belongs. We describe the design and implementation of our system, stressing how to exploit standard, efficient relational operations like sorts and joins. We report on experiences with the Reuters newswire benchmark, the US patent database, and web document samples from Yahoo!. We discuss applications where our system can improve searching and filtering capabilities.},
}

@article{Chakrabarti99,
  author    = {Soumen Chakrabarti and Byron E. Dom and S. Ravi Kumar and Prabhakar Raghavan and Sridhar Rajagopalan and Andrew Tomkins and David Gibson and Jon Kleinberg},
  title     = {Mining the Web's link structure},
  journal   = {IEEE Computer},
  year      = {1999},
  number    = {8},
  volume    = {32},
  pages     = {60--67},
  url       = {http://dlib.computer.org/co/books/co1999/pdf/r8060.pdf},
  abstract  = {The Web is a hypertext body of approximately 300 million pages that continues to grow at roughly a million pages per day. Page variation is more prodigious than the data's raw scale: Taken as a whole, the set of Web pages lacks a unifying structure and shows far more authoring style and content variation than that seen in traditional text-document collections. This level of complexity makes an ``off-the-shelf'' database-management and information-retrieval solution impossible. To date, index-based search engines for the Web have been the primary tool by which users search for information. Such engines can build giant indices that let you quickly retrieve the set of all Web pages containing a given word or string. Experienced users can make effective use of such engines for tasks that can be solved by searching for tightly constrained keywords and phrases.
These search engines are, however, unsuited for a wide range of equally important tasks. In particular, a topic of any breadth will typically contain several thousand or million relevant Web pages. How then, from this sea of pages, should a search engine select the correct ones---those of most value to the user?},
}

@inProceedings{Chandrinos00,
  author    = {Konstantinos V. Chandrinos and Ion Androutsopoulos and Georgios Paliouras and Constantine D. Spyropoulos},
  title     = {Automatic Web Rating: Filtering Obscene Content on the Web},
  booktitle = {Proceedings of ECDL-00, 4th European Conference on Research and Advanced Technology for Digital Libraries},
  editor    = {Jos{\'e} L. Borbinha and Thomas Baker},
  publisher = {Springer Verlag, Heidelberg, DE},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 1923},
  year      = {2000},
  address   = {Lisbon, PT},
  pages     = {403--406},
  url       = {http://link.springer.de/link/service/series/0558/papers/1923/19230403.pdf},
  abstract  = {We present a method to detect automatically pornographic content on the Web. Our method combines techniques from language engineering and image analysis within a machine-learning framework. Experimental results show that it achieves nearly perfect performance on a set of hard cases.},
}

@inProceedings{Chen00,
  author    = {Hao Chen and Susan T. Dumais},
  title     = {Bringing order to the Web: automatically categorizing search results},
  booktitle = {Proceedings of CHI-00, ACM International Conference on Human Factors in Computing Systems},
  publisher = {ACM Press, New York, US},
  editor    = {},
  year      = {2000},
  address   = {Den Haag, NL},
  pages     = {145--152},
  url       = {http://www.acm.org/pubs/articles/proceedings/chi/332040/p145-chen/p145-chen.pdf},
  abstract  = {We developed a user interface that organizes Web search results into hierarchical categories. Text classification algorithms were used to automatically classify arbitrary search results into an existing category structure on-the-fly.
A user study compared our new category interface with the typical ranked list interface of search results. The study showed that the category interface is superior both in objective and subjective measures. Subjects liked the category interface much better than the list interface, and they were 50\% faster at finding information that was organized into categories. Organizing search results allows users to focus on items in categories of interest rather than having to browse through all the results sequentially.},
}

@inProceedings{Chen00a,
  author    = {Hao Chen and Tin Kam Ho},
  title     = {Evaluation of Decision Forests on Text Categorization},
  booktitle = {Proceedings of the 7th SPIE Conference on Document Recognition and Retrieval},
  publisher = {SPIE {}-{} The International Society for Optical Engineering},
  editor    = {Daniel P. Lopresti and Jiangying Zhou},
  year      = {2000},
  address   = {San Jose, US},
  pages     = {191--199},
  url       = {http://cm.bell-labs.com/who/tkh/papers/textcat.ps.gz},
  abstract  = {Text categorization is useful for indexing documents for information retrieval, filtering parts for document understanding, and summarizing contents of documents of special interests. We describe a text categorization task and an experiment using documents from the Reuters and OHSUMED collections. We applied the Decision Forest classifier and compared its accuracies to those of C4.5 and kNN classifiers, using both category dependent and category independent term selection schemes. It is found that Decision Forest outperforms both C4.5 and kNN in all cases, and that category dependent term selection yields better accuracies.
Performances of all three classifiers degrade from the Reuters collection to the OHSUMED collection, but Decision Forest remains to be superior.},
}

@inProceedings{Chen01,
  author    = {Chien Chin Chen and Chen, Meng Chang and Yeali Sun},
  title     = {{PVA}: A Self-Adaptive Personal View Agent},
  booktitle = {Proceedings of KDD-01, 7th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  editor    = {Foster Provost and Ramakrishnan Srikant},
  year      = {2001},
  pages     = {257--262},
  publisher = {ACM Press, New York, US},
  address   = {San Francisco, US},
  url       = {http://doi.acm.org/10.1145/502512.502548},
  doi       = {10.1145/502512.502548},
  abstract  = {In this paper, we present PVA, an adaptive personal view information agent system to track, learn and manage, user's interests in Internet documents. When user's interests change, PVA, in not only the contents, but also in the structure of user profile, is modified to adapt to the changes. Experimental results show that modulating the structure of user profile does increase the accuracy of personalization systems.},
}

@article{Chen02,
  author    = {Chien Chin Chen and Chen, Meng Chang and Yeali Sun},
  title     = {{PVA}: A Self-Adaptive Personal View Agent},
  journal   = {Journal of Intelligent Information Systems},
  year      = {2002},
  note      = {Special Issue on Automated Text Categorization},
  volume    = {18},
  number    = {2/3},
  pages     = {173--194},
  url       = {http://www.wkap.nl/article.pdf?391245},
  abstract  = {In this paper, we present PVA, an adaptive personal view information agent system for tracking, learning and managing user interests in Internet documents. PVA consists of three parts: a {\it proxy}, {\it personal view constructor}, and {\it personal view maintainer}. The proxy logs the user's activities and extracts the user's interests without user intervention. The personal view constructor mines user interests and maps them to a class hierarchy (i.e., personal view). The personal view maintainer synchronizes user interests and the personal view periodically.
When user interests change, in PVA, not only the contents, but also the structure of the user profile are modified to adapt to the changes. In addition, PVA considers the aging problem of user interests. The experimental results show that modulating the structure of the user profile increases the accuracy of a personalization system.},
}

@article{Chen03,
  author    = {Chen, L. and Tokuda, N. and Nagai, A.},
  title     = {A new differential {LSI} space-based probabilistic document classifier},
  journal   = {Information Processing Letters},
  pages     = {203--212},
  year      = {2003},
  volume    = {88},
  number    = {5},
  doi       = {10.1016/j.ipl.2003.09.002},
  abstract  = {We have developed a new effective probabilistic classifier for document classification by introducing the concept of differential document vectors and DLSI (differential latent semantic indexing) spaces. A combined use of the projections on and the distances to the DLSI spaces introduced from the differential document vectors improves the adaptability of the LSI (latent semantic indexing) method by capturing unique characteristics of documents. Using the intra- and extra-document statistics, both a simple posteriori calculation on a small example and an experiment on a large Reuters-21578 database demonstrate the advantage of the DLSI space-based probabilistic classifier over the LSI space-based classifier in classification performance.},
}

@inProceedings{Chen04,
  author    = {Wenliang Chen and Jingbo Zhu and Honglin Wu and Tianshun Yao},
  title     = {Automatic Learning Features Using Bootstrapping for Text Categorization},
  booktitle = {Proceedings of CICLING-04, 5th International Conference on Computational Linguistics and Intelligent Text Processing},
  year      = {2004},
  editor    = {Alexander F.
Gelbukh},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Seoul, KR},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2945},
  pages     = {571--579},
  url       = {},
  abstract  = {},
}

@inProceedings{Cheng01,
  author    = {Cheng, Chun-Hung and Tang, Jian and Fu, Ada Wai-Chee and King, Irwin},
  title     = {Hierarchical Classification of Documents with Error Control},
  booktitle = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on Knowledge Discovery and Data Mining},
  editor    = {David Cheung and Qing Li and Graham Williams},
  year      = {2001},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Hong Kong, CN},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2035},
  pages     = {433--443},
  url       = {http://link.springer-ny.com/link/service/series/0558/papers/2035/20350433.pdf},
  abstract  = {Classification is a function that matches a new object with one of the predefined classes. Document classification is characterized by the large number of attributes involved in the objects (documents). The traditional method of building a single classifier to do all the classification work would incur a high overhead. Hierarchical classification is a more efficient method - instead of a single classifier, we use a set of classifiers distributed over a class taxonomy, one for each internal node. However, once a misclassification occurs at a high level class, it may result in a class that is far apart from the correct one. An existing approach to coping with this problem requires terms also to be arranged hierarchically. In this paper, instead of overhauling the classifier itself, we propose mechanisms to detect misclassification and take appropriate actions. We then discuss an alternative that masks the misclassification based on a well known software fault tolerance technique.
Our experiments show our algorithms represent a good trade-off between speed and accuracy in most applications.},
}

@inProceedings{Cheong02,
  author    = {Fung, Gabriel Pui Cheong and Yu, Jeffrey X. and Lu, Hongjun},
  title     = {Discriminative Category Matching: Efficient Text Classification for Huge Document Collections},
  booktitle = {Proceedings of ICDM-02, 2nd IEEE International Conference on Data Mining},
  editor    = {},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  address   = {Maebashi City, JP},
  year      = {2002},
  pages     = {187--194},
  url       = {http://dlib.computer.org/conferen/icdm/1754/pdf/17540187.pdf},
  abstract  = {With the rapid growth of textual information available on the Internet, having a good model for classifying and managing documents automatically is undoubtedly important. When more documents are archived, new terms, new concepts and concept-drift will frequently appear. Without a doubt, updating the classification model frequently rather than using the old model for a very long period is absolutely essential. Here, the challenges are: a) obtain a high accuracy classification model; b) consume low computational time for both model training and operation; and c) occupy low storage space. However, none of the existing classification approaches could achieve all of these requirements. In this paper, we propose a novel text classification approach, called Discriminative Category Matching, which could achieve all of the stated characteristics. Extensive experiments using two benchmarks and a large real-life collection are conducted.
The encouraging results indicated that our approach is highly feasible.},
}

@article{Chouchoulas01,
  author    = {Alexios Chouchoulas and Qiang Shen},
  title     = {Rough set-aided keyword reduction for text categorization},
  journal   = {Applied Artificial Intelligence},
  pages     = {843--873},
  year      = {2001},
  volume    = {15},
  number    = {9},
  url       = {},
  abstract  = {The volume of electronically stored information increases exponentially as the state of the art progresses. Automated information filtering (IF) and information retrieval (IR) systems are therefore acquiring rapidly increasing prominence. However, such systems sacrifice efficiency to boost effectiveness. Such systems typically have to cope with sets of vectors of many tens of thousands of dimensions. Rough set (RS) theory can be applied to reducing the dimensionality of data used in IF/IR tasks, by providing a measure of the information content of datasets with respect to a given classification. This can aid IF/IR systems that rely on the acquisition of large numbers of term weights or other measures of relevance. This article investigates the applicability of RS theory to the IF/IR application domain and compares this applicability with respect to various existing TC techniques. The ability of the approach to generalize, given a minimum of training data, is also addressed. The background of RS theory is presented, with an illustrative example to demonstrate the operation of the RS-based dimensionality reduction. A modular system is proposed which allows the integration of this technique with a large variety of different IF/IR approaches. The example application, categorization of E-mail messages, is described. Systematic experiments and their results are reported and analyzed.},
}

@inProceedings{Chuang00,
  author    = {Wesley T.
Chuang and Asok Tiyyagura and Jihoon Yang and Giovanni Giuffrida},
  title     = {A Fast Algorithm for Hierarchical Text Classification},
  booktitle = {Proceedings of DaWaK-00, 2nd International Conference on Data Warehousing and Knowledge Discovery},
  editor    = {Yahiko Kambayashi and Mukesh Mohania and A Min Tjoa},
  year      = {2000},
  publisher = {Springer Verlag, Heidelberg, DE},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 1874},
  address   = {London, UK},
  pages     = {409--418},
  url       = {http://www.cs.iastate.edu/~yang/Papers/dawak00.ps},
  abstract  = {Text classification is becoming more important with the proliferation of the Internet and the huge amount of data it transfers. We present an efficient algorithm for text classification using hierarchical classifiers based on a concept hierarchy. The simple TFIDF classifier is chosen to train sample data and to classify other new data. Despite its simplicity, results of experiments on Web pages and TV closed captions demonstrate high classification accuracy. Application of feature subset selection techniques improves the performance. Our algorithm is computationally efficient being bounded by O(n log n) for n samples.},
}

@inProceedings{Ciravegna99,
  author    = {Fabio Ciravegna and Alberto Lavelli and Nadia Mana and Johannes Matiasek and Luca Gilardoni and Silvia Mazza and William J. Black and Fabio Rinaldi},
  title     = {{FACILE}: Classifying Texts Integrating Pattern Matching and Information Extraction},
  booktitle = {Proceedings of IJCAI-99, 16th International Joint Conference on Artificial Intelligence},
  editor    = {Thomas Dean},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  year      = {1999},
  pages     = {890--895},
  address   = {Stockholm, SE},
  url       = {http://ecate.itc.it:1024/lavelli/lavelli-papers/IJCAI99/ijcai99.ps.gz},
  abstract  = {Successfully managing information means being able to find relevant new information and to correctly integrate it with pre-existing knowledge.
Much information is nowadays stored as multilingual textual data; therefore advanced classification systems are currently considered as strategic components for effective knowledge management. We describe an experience integrating different innovative AI technologies such as hierarchical pattern matching and information extraction to provide flexible multilingual classification adaptable to user needs. Pattern matching produces fairly accurate and fast categorisation over a large number of classes, while information extraction provides fine-grained classification for a reduced number of classes. The resulting system was adopted by the main Italian financial news agency providing a pay-to-view service.},
}

@inProceedings{Clack97,
  author    = {Clack, Chris and Farringdon, Johnny and Lidwell, Peter and Yu, Tina},
  title     = {Autonomous document classification for business},
  booktitle = {Proceedings of the 1st International Conference on Autonomous Agents},
  editor    = {Johnson, W. Lewis},
  publisher = {ACM Press, New York, US},
  address   = {Marina Del Rey, US},
  year      = {1997},
  pages     = {201--208},
  url       = {http://www.acm.org/pubs/articles/proceedings/ai/267658/p201-clack/p201-clack.pdf},
  abstract  = {With the continuing exponential growth of the Internet and the more recent growth of business Intranets, the commercial world is becoming increasingly aware of the problem of electronic information overload. This has encouraged interest in developing agents/softbots that can act as electronic personal assistants and can develop and adapt representations of users information needs, commonly known as profiles. As the result of collaborative research with Friends of the Earth, an environmental issues campaigning organisation, we have developed a general purpose information classification agent architecture and have applied it to the problem of document classification and routing.
Collaboration with Friends of the Earth allows us to test our ideas in a non-academic context involving high volumes of documents. We use the technique of genetic programming (GP), (Koza and Rice 1992), to evolve classifying agents. This is a novel approach for document classification, where each agent evolves a parse-tree representation of a user's particular information need. The other unusual features of our research are the longevity of our agents and the fact that they undergo a continual training process; feedback from the user enables the agent to adapt to the user's long-term information requirements.},
}

@inProceedings{Cohen95,
  author    = {Cohen, William W.},
  title     = {Text categorization and relational learning},
  booktitle = {Proceedings of ICML-95, 12th International Conference on Machine Learning},
  editor    = {Prieditis, Armand and Russell, Stuart J.},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  address   = {Lake Tahoe, US},
  year      = {1995},
  pages     = {124--132},
  url       = {http://www.research.whizbang.com/~wcohen/postscript/ml-95-ir.ps},
  abstract  = {We evaluate the first order learning system FOIL on a series of text categorization problems. It is shown that FOIL usually forms classifiers with lower error rates and higher rates of precision and recall with a relational encoding than with a propositional encoding. We show that FOIL's performance can be improved by relation selection, a first order analog of feature selection. Relation selection improves FOIL's performance as measured by any of recall, precision, F-measure, or error rate. With an appropriate level of relation selection, FOIL appears to be competitive with or superior to existing propositional techniques.},
}

@inCollection{Cohen95a,
  author    = {William W.
Cohen},
  title     = {Learning to classify {English} text with {ILP} methods},
  booktitle = {Advances in inductive logic programming},
  editor    = {De Raedt, Luc},
  publisher = {IOS Press},
  address   = {Amsterdam, NL},
  pages     = {124--143},
  year      = {1995},
  url       = {http://www.research.whizbang.com/~wcohen/postscript/ilp.ps},
  abstract  = {Text categorization is the task of classifying text into one of several predefined categories. In this paper we will evaluate the effectiveness of several ILP methods for text categorization, and also compare them to their propositional analogs. The methods considered are FOIL, the propositional rule-learning system RIPPER, and a first-order version of RIPPER called FLIPPER. We show that the benefit of using a first-order representation in this domain is relatively modest; in particular, the performance difference between FLIPPER and FOIL and their propositional counterparts is quite small, compared to the differences between FOIL and FLIPPER. However, a first-order representation seems to be advantageous when high-precision classifiers are desirable.},
}

@inProceedings{Cohen96a,
  author    = {William W. Cohen and Yoram Singer},
  title     = {Context-sensitive learning methods for text categorization},
  booktitle = {Proceedings of SIGIR-96, 19th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and Ross Wilkinson},
  publisher = {ACM Press, New York, US},
  year      = {1996},
  address   = {Z{\"{u}}rich, CH},
  pages     = {307--315},
  note      = {An extended version appears as~\cite{Cohen99}},
  url       = {http://www.research.whizbang.com/~wcohen/postscript/sigir-96.ps},
  abstract  = {Two machine learning algorithms, RIPPER and sleeping experts for phrases, are evaluated on a number of large text categorization problems.
These algorithms both construct classifiers that allow the ``context'' of a word w to affect how (or even whether) the presence or absence of w will contribute to a classification. However, RIPPER and sleeping experts differ radically in many other respects. Differences include: different notions as to what constitutes a context; different ways of combining contexts to construct a classifier; different methods to search for a combination of contexts; and different criteria as to what contexts should be included in such a combination. In spite of these differences, both RIPPER and sleeping experts perform extremely well across a wide variety of categorization problems, generally outperforming previously applied learning methods. We view this result as a confirmation of the usefulness of classifiers that represent contextual information.},
}

@inProceedings{Cohen98,
  author    = {Cohen, William W. and Hirsh, Haym},
  title     = {Joins that generalize: text classification using {{\sc Whirl}}},
  booktitle = {Proceedings of KDD-98, 4th International Conference on Knowledge Discovery and Data Mining},
  editor    = {Agrawal, Rakesh and Stolorz, Paul E. and Piatetsky-Shapiro, Gregory},
  publisher = {AAAI Press, Menlo Park, US},
  address   = {New York, US},
  year      = {1998},
  pages     = {169--173},
  url       = {http://www.research.whizbang.com/~wcohen/postscript/kdd-98.ps},
  abstract  = {WHIRL is an extension of relational databases that can perform ``soft joins'' based on the similarity of textual identifiers; these soft joins extend the traditional operation of joining tables based on the equivalence of atomic values. This paper evaluates WHIRL on a number of inductive classification tasks using data from the World Wide Web. We show that although WHIRL is designed for more general similarity-based reasoning tasks, it is competitive with mature inductive classification systems on these classification tasks.
In particular, WHIRL generally achieves lower generalization error than C4.5, RIPPER, and several nearest-neighbor methods. WHIRL is also fast---up to 500 times faster than C4.5 on some benchmark problems. We also show that WHIRL can be efficiently used to select from a large pool of unlabeled items those that can be classified correctly with high confidence.},
}

@article{Cohen99,
  author    = {William W. Cohen and Yoram Singer},
  title     = {Context-sensitive learning methods for text categorization},
  journal   = {ACM Transactions on Information Systems},
  year      = {1999},
  volume    = {17},
  number    = {2},
  pages     = {141--173},
  url       = {http://www.acm.org/pubs/articles/journals/tois/1999-17-2/p141-cohen/p141-cohen.pdf},
  abstract  = {Two recently implemented machine-learning algorithms, RIPPER and sleeping-experts for phrases, are evaluated on a number of large text categorization problems. These algorithms both construct classifiers that allow the ``context'' of a word w to affect how (or even whether) the presence or absence of w will contribute to a classification. However, RIPPER and sleeping-experts differ radically in many other respects: differences include different notions as to what constitutes a context, different ways of combining contexts to construct a classifier, different methods to search for a combination of contexts, and different criteria as to what contexts should be included in such a combination. In spite of these differences, both RIPPER and sleeping-experts perform extremely well across a wide variety of categorization problems, generally outperforming previously applied learning methods.
We view this result as a confirmation of the usefulness of classifiers that represent contextual information.},
}

@inProceedings{Crammer02,
  author    = {Koby Crammer and Yoram Singer},
  title     = {A New Family of Online Algorithms for Category Ranking},
  booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng and Kalervo J{\"{a}}rvelin},
  publisher = {ACM Press, New York, US},
  address   = {Tampere, FI},
  year      = {2002},
  pages     = {151--158},
  url       = {http://doi.acm.org/10.1145/564376.564404},
  abstract  = {We describe a new family of topic-ranking algorithms for multi-labeled documents. The motivation for the algorithms stems from recent advances in online learning algorithms. The algorithms we present are simple to implement and are time and memory efficient. We evaluate the algorithms on the Reuters-21578 corpus and the new corpus released by Reuters in 2000. On both corpora the algorithms we present outperform adaptations to topic-ranking of Rocchio's algorithm and the Perceptron algorithm. We also outline the formal analysis of the algorithm in the mistake bound model. To our knowledge, this work is the first to report performance results with the entire new Reuters corpus.},
}

@article{Craven00,
  author    = {Mark Craven and Dan DiPasquo and Dayne Freitag and Andrew K. McCallum and Tom M. Mitchell and Kamal Nigam and Se{\'{a}}n Slattery},
  title     = {Learning to Construct Knowledge Bases from the {World Wide Web}},
  journal   = {Artificial Intelligence},
  volume    = {118},
  number    = {1/2},
  year      = {2000},
  pages     = {69--113},
  url       = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/overview-aij99.ps.gz},
  abstract  = {The World Wide Web is a vast source of information accessible to computers, but understandable only to humans.
The goal of the research described here is to automatically create a computer understandable knowledge base whose content mirrors that of the World Wide Web. Such a knowledge base would enable much more effective retrieval of Web information, and promote new uses of the Web to support knowledge-based inference and problem solving. Our approach is to develop a trainable information extraction system that takes two inputs. The first is an ontology that defines the classes (e.g., company, person, employee, product) and relations (e.g., employed\_by, produced\_by) of interest when creating the knowledge base. The second is a set of training data consisting of labeled regions of hypertext that represent instances of these classes and relations. Given these inputs, the system learns to extract information from other pages and hyperlinks on the Web. This article describes our general approach, several machine learning algorithms for this task, and promising initial results with a prototype system that has created a knowledge base describing university people, courses, and research projects.},
}

@article{Craven01,
  author    = {Craven, Mark and Slattery, Se{\'{a}}n},
  title     = {Relational learning with statistical predicate invention: Better models for hypertext},
  journal   = {Machine Learning},
  pages     = {97--119},
  year      = {2001},
  volume    = {43},
  number    = {1/2},
  url       = {http://www.wkap.nl/article.pdf?321079},
  abstract  = {We present a new approach to learning hypertext classifiers that combines a statistical text-learning method with a relational rule learner. This approach is well suited to learning in hypertext domains because its statistical component allows it to characterize text in terms of word frequencies, whereas its relational component is able to describe how neighboring documents are related to each other by hyperlinks that connect them.
We evaluate our approach by applying it to tasks that involve learning definitions for (i) classes of pages, (ii) particular relations that exist between pairs of pages, and (iii) locating a particular class of information in the internal structure of pages. Our experiments demonstrate that this new approach is able to learn more accurate classifiers than either of its constituent methods alone.},
}

@inProceedings{Craven98,
  author    = {Mark Craven and Dan DiPasquo and Dayne Freitag and Andrew K. McCallum and Tom M. Mitchell and Kamal Nigam and Se{\'{a}}n Slattery},
  title     = {Learning to extract symbolic knowledge from the {World Wide Web}},
  booktitle = {Proceedings of AAAI-98, 15th Conference of the American Association for Artificial Intelligence},
  publisher = {AAAI Press, Menlo Park, US},
  year      = {1998},
  pages     = {509--516},
  address   = {Madison, US},
  note      = {An extended version appears as~\cite{Craven00}},
  url       = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/overview-aaai98.ps.gz},
  abstract  = {The World Wide Web is a vast source of information accessible to computers, but understandable only to humans. The goal of the research described here is to automatically create a computer understandable world wide knowledge base whose content mirrors that of the World Wide Web. Such a knowledge base would enable much more effective retrieval of Web information, and promote new uses of the Web to support knowledge-based inference and problem solving. Our approach is to develop a trainable information extraction system that takes two inputs: an ontology defining the classes and relations of interest, and a set of training data consisting of labeled regions of hypertext representing instances of these classes and relations. Given these inputs, the system learns to extract information from other pages and hyperlinks on the Web.
This paper describes our general approach, several machine learning algorithms for this task, and promising initial results with a prototype system.},
}

@article{Creecy92,
  author    = {Robert M. Creecy and Brij M. Masand and Stephen J. Smith and David L. Waltz},
  title     = {Trading {MIPS} and memory for knowledge engineering: classifying census returns on the {Connection Machine}},
  journal   = {Communications of the ACM},
  volume    = {35},
  number    = {8},
  year      = {1992},
  pages     = {48--63},
  url       = {http://www.acm.org/pubs/articles/journals/cacm/1992-35-8/p48-creecy/p48-creecy.pdf},
}

@inProceedings{Cristianini01,
  author    = {Nello Cristianini and John Shawe-Taylor and Huma Lodhi},
  title     = {Latent Semantic Kernels},
  booktitle = {Proceedings of ICML-01, 18th International Conference on Machine Learning},
  editor    = {Carla Brodley and Andrea Danyluk},
  address   = {Williams College, US},
  year      = {2001},
  pages     = {66--73},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url       = {},
  abstract  = {Kernel methods like Support Vector Machines have successfully been used for text categorization. A standard choice of kernel function has been the inner product between the vector-space representation of two documents, in analogy with classical information retrieval (IR) approaches. Latent Semantic Indexing (LSI) has been successfully used for IR purposes, as a technique for capturing semantic relations between terms and inserting them into the similarity measure between two documents. One of its main drawbacks, in IR, is its computational cost. In this paper we describe how the LSI approach can be implemented in a kernel-defined feature space. We provide experimental results demonstrating that the approach can significantly improve performance, and that it does not impair it.},
}

@inCollection{Cristianini01a,
  author    = {Huma Lodhi and John Shawe-Taylor and Nello Cristianini and Christopher J.
Watkins},
  title     = {Discrete Kernels for Text Categorisation},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Todd K. Leen and Thomas G. Dietterich and Volker Tresp},
  volume    = {13},
  year      = {2001},
  pages     = {563--569},
  publisher = {MIT Press, Cambridge, US},
  url       = {http://www.support-vector.net/papers/LodhiShawe-TaylorCristianiniWatkins_ps.ps},
  abstract  = {},
}

@article{Cristianini02,
  author    = {Nello Cristianini and John Shawe-Taylor and Huma Lodhi},
  title     = {Latent Semantic Kernels},
  journal   = {Journal of Intelligent Information Systems},
  year      = {2002},
  note      = {Special Issue on Automated Text Categorization},
  volume    = {18},
  number    = {2/3},
  pages     = {127--152},
  url       = {http://www.wkap.nl/article.pdf?391243},
  abstract  = {Kernel methods like Support Vector Machines have successfully been used for text categorization. A standard choice of kernel function has been the inner product between the vector-space representation of two documents, in analogy with classical information retrieval (IR) approaches. Latent Semantic Indexing (LSI) has been successfully used for IR purposes as a technique for capturing semantic relations between terms and inserting them into the similarity measure between two documents. One of its main drawbacks, in IR, is its computational cost. In this paper we describe how the LSI approach can be implemented in a kernel-defined feature space.
We provide experimental results demonstrating that the approach can significantly improve performance, and that it does not impair it.},
}

@inProceedings{Dagan96,
  author    = {Dagan, Ido and Feldman, Ronen and Hirsh, Haym},
  title     = {Keyword-based browsing and analysis of large document sets},
  booktitle = {Proceedings of SDAIR-96, 5th Annual Symposium on Document Analysis and Information Retrieval},
  editor    = {},
  publisher = {},
  address   = {Las Vegas, US},
  year      = {1996},
  pages     = {191--207},
  url       = {},
  abstract  = {Knowledge discovery in databases (KDD) focuses on the computerized exploration of large amounts of data and on the discovery of interesting patterns within them. While most work on KDD has been concerned with structured databases, there has been little work on handling the huge amount of information that is available only in unstructured textual form. The paper describes the KDT system for knowledge discovery in texts. It is built on top of a text-categorization paradigm where text articles are annotated with keywords organized in a hierarchical structure. Knowledge discovery is performed by analyzing the co-occurrence frequencies of keywords from this hierarchy in the various documents.
The authors show how this term-frequency approach supports a range of KDD operations, providing a general framework for knowledge discovery and exploration in collections of unstructured text.},
}

@inProceedings{Dagan97,
  author    = {Dagan, Ido and Karov, Yael and Roth, Dan},
  title     = {Mistake-driven learning in text categorization},
  booktitle = {Proceedings of EMNLP-97, 2nd Conference on Empirical Methods in Natural Language Processing},
  editor    = {Cardie, Claire and Weischedel, Ralph},
  publisher = {Association for Computational Linguistics, Morristown, US},
  address   = {Providence, US},
  year      = {1997},
  pages     = {55--63},
  url       = {http://l2r.cs.uiuc.edu/~danr/Papers/categ.ps.gz},
  abstract  = {Learning problems in the text processing domain often map the text to a space whose dimensions are the measured features of the text, e.g., its words. Three characteristic properties of this domain are (a) very high dimensionality, (b) both the learned concepts and the instances reside very sparsely in the feature space, and (c) a high variation in the number of active features in an instance. In this work we study three mistake-driven learning algorithms for a typical task of this nature - text categorization. We argue that these algorithms which categorize documents by learning a linear separator in the feature space have a few properties that make them ideal for this domain. We then show that a quantum leap in performance is achieved when we further modify the algorithms to better address some of the specific characteristics of the domain. In particular, we demonstrate (1) how variation in document length can be tolerated by either normalizing feature weights or by using negative weights, (2) the positive effect of applying a threshold range in training, (3) alternatives in considering feature frequency, and (4) the benefits of discarding features while training.
Overall, we present an algorithm, a variation of Littlestone's Winnow, which performs significantly better than any other algorithm tested on this task using a similar feature set.},
}

@inProceedings{Dalessio00,
  author    = {Stephen D'Alessio and Keitha Murray and Robert Schiaffino and Aaron Kershenbaum},
  title     = {The effect of using Hierarchical classifiers in Text Categorization},
  booktitle = {Proceedings of RIAO-00, 6th International Conference ``Recherche d'Information Assist{\'{e}}e par Ordinateur''},
  editor    = {},
  address   = {Paris, FR},
  year      = {2000},
  pages     = {302--313},
  url       = {http://www.iona.edu/cs/FacultyPublications/riao2000New.pdf},
  abstract  = {Given a set of categories, with or without a preexisting hierarchy among them, we consider the problem of assigning documents to one or more of these categories from the point of view of a hierarchy with more or less depth. We can choose to make use of none, part or all of the hierarchical structure to improve the categorization effectiveness and efficiency. It is possible to create additional hierarchy among the categories. We describe a procedure for generating a hierarchy of classifiers that model the hierarchy structure. We report on computational experience using this procedure. We show that judicious use of a hierarchy can significantly improve both the speed and effectiveness of the categorization process.
Using the Reuters-21578 corpus, we obtain an improvement in running time of over a factor of three and a 5\% improvement in F-measure.},
}

@inProceedings{Dalessio98,
  author    = {D'Alessio, Stephen and Murray, Keitha and Schiaffino, Robert and Kershenbaum, Aaron},
  title     = {Category Levels in Hierarchical Text Categorization},
  booktitle = {Proceedings of EMNLP-98, 3rd Conference on Empirical Methods in Natural Language Processing},
  editor    = {},
  publisher = {Association for Computational Linguistics, Morristown, US},
  address   = {Granada, ES},
  year      = {1998},
  pages     = {},
  url       = {http://www.iona.edu/cs/FacultyPublications/emnlpf.pdf},
  abstract  = {We consider the problem of assigning level numbers (weights) to hierarchically organized categories during the process of text categorization. These levels control the ability of the categories to attract documents during the categorization process. The levels are adjusted in order to obtain a balance between recall and precision for each category. If a category's recall exceeds its precision, the category is too strong and its level is reduced. Conversely, a category's level is increased to strengthen it if its precision exceeds its recall. The categorization algorithm used is a supervised learning procedure that uses a linear classifier based on the category levels. We are given a set of categories, organized hierarchically. We are also given a training corpus of documents already placed in one or more categories. From these, we extract vocabulary, words that appear with high frequency within a given category, characterizing each subject area. Each node's vocabulary is filtered and its words assigned weights with respect to the specific category. Then, test documents are scanned and categories ranked based on the presence of vocabulary terms. Documents are assigned to categories based on these rankings.
We demonstrate that precision and recall can be significantly improved by solving the categorization problem taking hierarchy into account. Specifically, we show that by adjusting the category levels in a principled way, that precision can be significantly improved, from 84\% to 91\%, on the much-studied Reuters-21578 corpus organized in a three-level hierarchy of categories.},
}

@article{Damashek95,
  author    = {Damashek, Marc},
  title     = {Gauging Similarity with N-Grams: Language-Independent Categorization of Text},
  journal   = {Science},
  volume    = {267},
  number    = {5199},
  year      = {1995},
  pages     = {843--848},
  url       = {},
  abstract  = {A language-independent means of gauging topical similarity in unrestricted text is described. The method combines information derived from n-grams (consecutive sequences of n characters) with a simple vector-space technique that makes sorting, categorization, and retrieval feasible in a large multilingual collection of documents. No prior information about document content or language is required. Context, as it applies to document similarity, can be accommodated by a well-defined procedure. When an existing document is used as an exemplar, the completeness and accuracy with which topically related documents are retrieved is comparable to that of the best existing systems. The results of a formal evaluation are discussed, and examples are given using documents in English and Japanese.},
}

@article{Damerau04,
  author    = {Damerau, Fred J. and Zhang, Tong and Weiss, Sholom M. and Indurkhya, Nitin},
  title     = {Text categorization for a comprehensive time-dependent benchmark},
  journal   = {Information Processing and Management},
  volume    = {40},
  number    = {2},
  year      = {2004},
  pages     = {209--221},
  url       = {},
  abstract  = {},
}

@article{Dasigi01,
  author    = {Dasigi, Venu and Mann, Reinhold C.
and Protopopescu, Vladimir A.},
  title    = {Information fusion for text classification: an experimental comparison},
  journal  = {Pattern Recognition},
  year     = {2001},
  volume   = {34},
  number   = {12},
  pages    = {2413--2425},
  url      = {},
  abstract = {This article reports on our experiments and results on the effectiveness of different feature sets and information fusion from some combinations of them in classifying free text documents into a given number of categories. We use different feature sets and integrate neural network learning into the method. The feature sets are based on the ``latent semantics'' of a reference library - a collection of documents adequately representing the desired concepts. We found that a larger reference library is not necessarily better. Information fusion almost always gives better results than the individual constituent feature sets, with certain combinations doing better than the others.},
}

@inProceedings{Davidov04,
  author    = {Dmitry Davidov and Evgeniy Gabrilovich and Shaul Markovitch},
  title     = {Parameterized Generation of Labeled Datasets for Text Categorization Based on a Hierarchical Directory},
  booktitle = {Proceedings of SIGIR-04, 27th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Kalervo J{\"{a}}rvelin and James Allan and Peter Bruza and Mark Sanderson},
  publisher = {ACM Press, New York, US},
  address   = {Sheffield, UK},
  year      = {2004},
  pages     = {250--257},
  url       = {http://www.cs.technion.ac.il/~gabr/papers/accio.pdf},
  abstract  = {Although text categorization is a burgeoning area of IR research, readily available test collections in this field are surprisingly scarce. We describe a methodology and system (named ACCIO) for automatically acquiring labeled datasets for text categorization from the World Wide Web, by capitalizing on the body of knowledge encoded in the structure of existing hierarchical directories such as the Open Directory.
We define parameters of categories that make it possible to acquire numerous datasets with desired properties, which in turn allow better control over categorization experiments. In particular, we develop metrics that estimate the difficulty of a dataset by examining the host directory structure. These metrics are shown to be good predictors of categorization accuracy that can be achieved on a dataset, and serve as efficient heuristics for generating datasets subject to user's requirements. A large collection of automatically generated datasets are made available for other researchers to use.},
}

@inProceedings{Debole03,
  author    = {Franca Debole and Fabrizio Sebastiani},
  title     = {Supervised term weighting for automated text categorization},
  year      = {2003},
  booktitle = {Proceedings of SAC-03, 18th ACM Symposium on Applied Computing},
  address   = {Melbourne, US},
  publisher = {ACM Press, New York, US},
  pages     = {784--788},
  url       = {http://www.math.unipd.it/~fabseb60/Publications/SAC03b.pdf},
  note      = {An extended version appears as~\cite{Debole04a}},
  abstract  = {The construction of a text classifier usually involves (i) a phase of \emph{term selection}, in which the most relevant terms for the classification task are identified, (ii) a phase of \emph{term weighting}, in which document weights for the selected terms are computed, and (iii) a phase of \emph{classifier learning}, in which a classifier is generated from the weighted representations of the training documents. This process involves an activity of {\em supervised learning}, in which information on the membership of training documents in categories is used. Traditionally, supervised learning enters only phases (i) and (iii). In this paper we propose instead that learning from training data should also affect phase (ii), i.e.\ that information on the membership of training documents to categories be used to determine term weights. We call this idea \emph{supervised term weighting} (STW).
As an example, we propose a number of ``supervised variants'' of $tfidf$ weighting, obtained by replacing the $idf$ function with the function that has been used in phase (i) for term selection. We present experimental results obtained on the standard \textsf{Reuters-21578} benchmark with one classifier learning method (support vector machines), three term selection functions (information gain, chi-square, and gain ratio), and both local and global term selection and weighting.},
}

% NOTE: Debole04 and Debole04c both describe the LREC-04 paper (same authors,
% venue and URL). Both keys are kept so that existing citations keep working.
@inProceedings{Debole04,
  author    = {Franca Debole and Fabrizio Sebastiani},
  title     = {An Analysis of the Relative Difficulty of {Reuters-21578} Subsets},
  year      = {2004},
  booktitle = {Proceedings of LREC-04, 4th International Conference on Language Resources and Evaluation},
  address   = {Lisbon, PT},
  pages     = {971--974},
  url       = {http://www.math.unipd.it/~fabseb60/Publications/LREC04.pdf},
  abstract  = {The existence, public availability, and widespread acceptance of a standard benchmark for a given information retrieval (IR) task are beneficial to research on this task, since they allow different researchers to experimentally compare their own systems by comparing the results they have obtained on this benchmark. The \textsf{Reuters-21578} test collection, together with its earlier variants, has been such a standard benchmark for the text categorization (TC) task throughout the last ten years. However, the benefits that this has brought about have somehow been limited by the fact that different researchers have ``carved'' different subsets out of this collection, and tested their systems on one of these subsets only; systems that have been tested on different \textsf{Reuters-21578} subsets are thus not readily comparable. In this paper we present a systematic, comparative experimental study of the three subsets of \textsf{Reuters-21578} that have been most popular among TC researchers. The results we obtain allow us to determine the relative difficulty of these subsets, thus establishing an indirect means for comparing TC systems that have, or will be, tested on these different subsets.},
}

@inCollection{Debole04a,
  author    = {Franca Debole and Fabrizio Sebastiani},
  title     = {Supervised Term Weighting for Automated Text Categorization},
  year      = {2004},
  booktitle = {Text Mining and its Applications},
  editor    = {Spiros Sirmakessis},
  publisher = {Physica-Verlag, Heidelberg, DE},
  series    = {Studies in Fuzziness and Soft Computing},
  number    = {138},
  pages     = {81--98},
  url       = {http://www.math.unipd.it/~fabseb60/Publications/NEMIS04.pdf},
  abstract  = {The construction of a text classifier usually involves (i) a phase of \emph{term selection}, in which the most relevant terms for the classification task are identified, (ii) a phase of \emph{term weighting}, in which document weights for the selected terms are computed, and (iii) a phase of \emph{classifier learning}, in which a classifier is generated from the weighted representations of the training documents. This process involves an activity of {\em supervised learning}, in which information on the membership of training documents in categories is used. Traditionally, supervised learning enters only phases (i) and (iii). In this paper we propose instead that learning from the training data should also affect phase (ii), i.e.\ that information on the membership of training documents to categories be used to determine term weights. We call this idea \emph{supervised term weighting} (STW). As an example of STW, we propose a number of ``supervised variants'' of $tfidf$ weighting, obtained by replacing the $idf$ function with the function that has been used in phase (i) for term selection. The use of STW allows the terms that are distributed most differently in the positive and negative examples of the categories of interest to be weighted highest.
We present experimental results obtained on the standard \textsf{Reuters-21578} benchmark with three classifier learning methods (Rocchio, $k$-NN, and support vector machines), three term selection functions (information gain, chi-square, and gain ratio), and both local and global term selection and weighting.},
}

@inProceedings{Debole04c,
  author    = {Franca Debole and Fabrizio Sebastiani},
  title     = {An Analysis of the Relative Hardness of {Reuters-21578} Subsets},
  year      = {2004},
  booktitle = {Proceedings of LREC-04, 4th International Conference on Language Resources and Evaluation},
  address   = {Lisbon, PT},
  pages     = {971--974},
  url       = {http://www.math.unipd.it/~fabseb60/Publications/LREC04.pdf},
}

@article{Debole05,
  author   = {Franca Debole and Fabrizio Sebastiani},
  title    = {An Analysis of the Relative Hardness of {Reuters-21578} Subsets},
  journal  = {Journal of the American Society for Information Science and Technology},
  year     = {2005},
  volume   = {56},
  number   = {6},
  pages    = {584--596},
  url      = {http://www.math.unipd.it/~fabseb60/Publications/JASIST05.pdf},
  abstract = {The existence, public availability, and widespread acceptance of a standard benchmark for a given information retrieval (IR) task are beneficial to research on this task, since they allow different researchers to experimentally compare their own systems by comparing the results they have obtained on this benchmark. The \textsf{Reuters-21578} test collection, together with its earlier variants, has been such a standard benchmark for the text categorization (TC) task throughout the last ten years. However, the benefits that this has brought about have somehow been limited by the fact that different researchers have ``carved'' different subsets out of this collection, and tested their systems on one of these subsets only; systems that have been tested on different \textsf{Reuters-21578} subsets are thus not readily comparable.
In this paper we present a systematic, comparative experimental study of the three subsets of \textsf{Reuters-21578} that have been most popular among TC researchers. The results we obtain allow us to determine the relative hardness of these subsets, thus establishing an indirect means for comparing TC systems that have, or will be, tested on these different subsets.},
}

@inProceedings{deBuenaga97,
  author    = {De Buenaga Rodr{\'{\i}}guez, Manuel and G{\'o}mez-Hidalgo, Jos{\'e} Mar{\'{\i}}a and D{\'{\i}}az-Agudo, Bel{\'e}n},
  title     = {Using {WordNet} to Complement Training Information in Text Categorization},
  booktitle = {Proceedings of RANLP-97, 2nd International Conference on Recent Advances in Natural Language Processing},
  publisher = {},
  editor    = {Ruslan Mitkov and Nicolas Nicolov and Nikolai Nikolov},
  address   = {Tzigov Chark, BG},
  pages     = {},
  year      = {1997},
  url       = {http://xxx.unizar.es/ps/cmp-lg/9709007},
  abstract  = {Automatic Text Categorization (TC) is a complex and useful task for many natural language applications, and is usually performed through the use of a set of manually classified documents, a training collection. We suggest the utilization of additional resources like lexical databases to increase the amount of information that TC systems make use of, and thus, to improve their performance. Our approach integrates WordNet information with two training approaches through the Vector Space Model. The training approaches we test are the Rocchio (relevance feedback) and the Widrow-Hoff (machine learning) algorithms. Results obtained from evaluation show that the integration of WordNet clearly outperforms training approaches, and that an integrated technique can effectively address the classification of low frequency categories.},
}

@inProceedings{deLima98,
  author    = {De Lima, Luciano R. and Laender, Alberto H.
and Ribeiro-Neto, Berthier A.},
  title     = {A hierarchical approach to the automatic categorization of medical documents},
  booktitle = {Proceedings of CIKM-98, 7th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor    = {Georges Gardarin and James C. French and Niki Pissinou and Kia Makki and Luc Bouganim},
  year      = {1998},
  address   = {Bethesda, US},
  pages     = {132--139},
  url       = {http://www.acm.org/pubs/articles/proceedings/cikm/288627/p132-de_lima/p132-de_lima.pdf},
  abstract  = {},
}

@inProceedings{Denoyer01,
  author    = {Ludovic Denoyer and Hugo Zaragoza and Patrick Gallinari},
  title     = {{HMM-based} Passage Models for Document Classification and Ranking},
  booktitle = {Proceedings of ECIR-01, 23rd European Colloquium on Information Retrieval Research},
  editor    = {},
  year      = {2001},
  address   = {Darmstadt, DE},
  publisher = {},
  pages     = {126--135},
  url       = {http://www-connex.lip6.fr/~denoyer/publications/denoyer-final-ecir01.ps},
  abstract  = {We present an application of Hidden Markov Models to supervised document classification and ranking. We consider a family of models that take into account the fact that relevant documents may contain irrelevant passages; the originality of the model is that it does not explicitly segment documents but rather considers all possible segmentations in its final score. This model generalizes the multinomial Naive Bayes and it is derived from a more general model for different access tasks. The model is evaluated on the REUTERS test collection and compared to the multinomial Naive Bayes model. It is shown to be more robust with respect to the training set size and to improve the performance both for ranking and classification, specially for classes with few training examples.},
}

@inProceedings{Denoyer03,
  author    = {Ludovic Denoyer and Patrick Gallinari},
  title     = {A Belief Networks-Based Generative Model for Structured Documents.
An Application to the {XML} Categorization},
  booktitle = {Proceedings of MLDM-03, 3rd International Conference on Machine Learning and Data Mining in Pattern Recognition},
  editor    = {Petra Perner and Azriel Rosenfeld},
  year      = {2003},
  address   = {Leipzig, DE},
  publisher = {Springer Verlag, Heidelberg, DE},
  pages     = {328--342},
  url       = {http://www.springerlink.com/openurl.asp?genre=article&issn=0302-9743&volume=2734&spage=328},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2734},
  abstract  = {We present a generative Bayesian model for the modeling of structured (e.g. XML) documents. This model allows us to simultaneously take into account structure and content information. It is used here for classifying XML documents. We adopt a machine learning approach and the model parameters are learned from a labeled training set of representative documents. We discuss the role of structural information for classification and describe experiments on a small collection of class labeled structured documents. We also present preliminary results showing how this model could classify documents with DTDs not represented in the training set.},
}

@inProceedings{Denoyer03a,
  author    = {Ludovic Denoyer and Jean-No{\"{e}}l Vittaut and Patrick Gallinari and Sylvie Brunessaux and Stephan Brunessaux},
  title     = {Structured multimedia document classification},
  booktitle = {Proceedings of DOCENG-03, ACM Symposium on Document Engineering},
  publisher = {ACM Press, New York, US},
  editor    = {},
  year      = {2003},
  address   = {Grenoble, FR},
  pages     = {153--160},
  url       = {http://doi.acm.org/10.1145/958220.958249},
  abstract  = {We propose a new statistical model for the classification of structured documents and consider its use for multimedia document classification. Its main originality is its ability to simultaneously take into account the structural and the content information present in a structured document, and also to cope with different types of content (text, image, etc).
We present experiments on the classification of multilingual pornographic HTML pages using text and image data. The system accurately classifies porn sites from 8 European languages. This corpus has been developed by EADS company in the context of a large Web site filtering application.},
}

@article{Denoyer04,
  author   = {Ludovic Denoyer and Patrick Gallinari},
  title    = {Bayesian network model for semi-structured document classification},
  journal  = {Information Processing and Management},
  year     = {2004},
  volume   = {40},
  number   = {5},
  pages    = {807--827},
  url      = {},
  abstract = {},
}

@article{deVel01,
  author   = {Olivier Y. {De Vel} and Alison Anderson and Malcolm Corney and George M. Mohay},
  title    = {Mining Email Content for Author Identification Forensics},
  journal  = {SIGMOD Record},
  year     = {2001},
  volume   = {30},
  number   = {4},
  pages    = {55--64},
  url      = {},
  abstract = {We describe an investigation into e-mail content mining for author identification, or authorship attribution, for the purpose of forensic investigation. We focus our discussion on the ability to discriminate between authors for the case of both aggregated e-mail topics as well as across different email topics. An extended set of e-mail document features including structural characteristics and linguistic patterns were derived and, together with a Support Vector Machine learning algorithm, were used for mining the e-mail content.
Experiments using a number of e-mail documents generated by different authors on a set of topics gave promising results for both aggregated and multi-topic author categorisation.},
}

@inProceedings{Dhillon02,
  author    = {Inderjit Dhillon and Subramanyam Mallela and Rahul Kumar},
  title     = {Enhanced word clustering for hierarchical text classification},
  booktitle = {Proceedings of KDD-02, 8th ACM International Conference on Knowledge Discovery and Data Mining},
  publisher = {ACM Press, New York, US},
  editor    = {},
  year      = {2002},
  address   = {Edmonton, CA},
  pages     = {191--200},
  url       = {},
  abstract  = {In this paper we propose a new information-theoretic divisive algorithm for word clustering applied to text classification. In previous work, such ``distributional clustering'' of features has been found to achieve improvements over feature selection in terms of classification accuracy, especially at lower number of features [2, 28]. However the existing clustering techniques are agglomerative in nature and result in (i) suboptimal word clusters and (ii) high computational cost. In order to explicitly capture the optimality of word clusters in an information theoretic framework, we first derive a global criterion for feature clustering. We then present a fast, divisive algorithm that monotonically decreases this objective function value, thus converging to a local minimum. We show that our algorithm minimizes the ``within-cluster Jensen-Shannon divergence'' while simultaneously maximizing the ``between-cluster Jensen-Shannon divergence''. In comparison to the previously proposed agglomerative strategies our divisive algorithm achieves higher classification accuracy especially at lower number of features. We further show that feature clustering is an effective technique for building smaller class models in hierarchical classification.
We present detailed experimental results using Naive Bayes and Support Vector Machines on the 20 Newsgroups data set and a 3-level hierarchy of HTML documents collected from Dmoz Open Directory.},
}

@article{Dhillon03,
  author   = {Inderjit Dhillon and Subramanyam Mallela and Rahul Kumar},
  title    = {A divisive information-theoretic feature clustering algorithm for text classification},
  journal  = {Journal of Machine Learning Research},
  volume   = {3},
  month    = mar,
  pages    = {1265--1287},
  year     = {2003},
  url      = {http://www.jmlr.org/papers/volume3/dhillon03a/dhillon03a.pdf},
  abstract = {High dimensionality of text can be a deterrent in applying complex learners such as Support Vector Machines to the task of text classification. Feature clustering is a powerful alternative to feature selection for reducing the dimensionality of text data. In this paper we propose a new information-theoretic divisive algorithm for feature/word clustering and apply it to text classification. Existing techniques for such ``distributional clustering'' of words are agglomerative in nature and result in (i) sub-optimal word clusters and (ii) high computational cost. In order to explicitly capture the optimality of word clusters in an information theoretic framework, we first derive a global criterion for feature clustering. We then present a fast, divisive algorithm that monotonically decreases this objective function value. We show that our algorithm minimizes the ``within-cluster Jensen-Shannon divergence'' while simultaneously maximizing the ``between-cluster Jensen-Shannon divergence''. In comparison to the previously proposed agglomerative strategies our divisive algorithm is much faster and achieves comparable or higher classification accuracies. We further show that feature clustering is an effective technique for building smaller class models in hierarchical classification.
We present detailed experimental results using Naive Bayes and Support Vector Machines on the 20Newsgroups data set and a 3-level hierarchy of HTML documents collected from the Open Directory project (www.dmoz.org).},
}

@inProceedings{Diao00,
  author    = {Yanlei Diao and Hongjun Lu and Dekai Wu},
  title     = {A comparative study of classification-based personal e-mail filtering},
  booktitle = {Proceedings of PAKDD-00, 4th Pacific-Asia Conference on Knowledge Discovery and Data Mining},
  editor    = {Takao Terano and Huan Liu and Arbee L. P. Chen},
  pages     = {408--419},
  year      = {2000},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Kyoto, JP},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 1805},
  url       = {http://www.cs.berkeley.edu/~diaoyl/publications/pakdd00.ps},
  abstract  = {This paper addresses personal E-mail filtering by casting it in the framework of text classification. Modeled as semi-structured documents, E-mail messages consist of a set of fields with predefined semantics and a number of variable length free-text fields. While most work on classification either concentrates on structured data or free text, the work in this paper deals with both of them. To perform classification, a naive Bayesian classifier was designed and implemented, and a decision tree based classifier was implemented. The design considerations and implementation issues are discussed. Using a relatively large amount of real personal E-mail data, a comprehensive comparative study was conducted using the two classifiers. The importance of different features is reported. Results of other issues related to building an effective personal E-mail classifier are presented and discussed. It is shown that both classifiers can perform filtering with reasonable accuracy.
While the decision tree based classifier outperforms the Bayesian classifier when features and training size are selected optimally for both, a carefully designed naive Bayesian classifier is more robust.},
}

@inProceedings{Diaz98,
  author    = {D{\'{\i}}az Esteban, Alberto and De Buenaga Rodr{\'{\i}}guez, Manuel and Ure{\~n}a L{\'o}pez, L. Alfonso and Garc{\'{\i}}a Vega, Manuel},
  title     = {Integrating Linguistic Resources in an Uniform Way for Text Classification Tasks},
  booktitle = {Proceedings of LREC-98, 1st International Conference on Language Resources and Evaluation},
  publisher = {},
  editor    = {Antonio Rubio and Natividad Gallardo and Rosa Castro and Antonio Tejada},
  address   = {Granada, ES},
  pages     = {1197--1204},
  year      = {1998},
  url       = {http://www.esi.uem.es/laboratorios/sinai/postscripts/lrec98.ps},
  abstract  = {Applications based on automatic text classification tasks, like text categorization (TC), word sense disambiguation (WSD), text filtering or routing, monolingual or multilingual information retrieval, and text summarization could obtain serious improvements by integrating linguistic resources in the current methods. We present an approach using the Vector Space Model to integrate two different kind of resources: a lexical database and training collections, in text content analysis tasks. The training approaches we test are the Rocchio (relevance feedback) and the Widrow-Hoff (machine learning) algorithms and WordNet as the lexical database. We have developed experimental systems for TC and WSD.
Results obtained from evaluation show that the integration of WordNet can outperform approaches based only on training.},
}

@article{Diederich03,
  author   = {Diederich, Joachim and Kindermann, J{\"{o}}rg and Leopold, Edda and Paass, Gerhard},
  title    = {Authorship Attribution with Support Vector Machines},
  journal  = {Applied Intelligence},
  year     = {2003},
  volume   = {19},
  number   = {1/2},
  pages    = {109--123},
  url      = {http://ipsapp007.kluweronline.com/content/getfile/4504/36/6/abstract.htm},
  abstract = {In this paper we explore the use of text-mining methods for the identification of the author of a text. We apply the support vector machine (SVM) to this problem, as it is able to cope with half a million of inputs it requires no feature selection and can process the frequency vector of all words of a text. We performed a number of experiments with texts from a German newspaper. With nearly perfect reliability the SVM was able to reject other authors and detected the target author in 60-80\% of the cases. In a second experiment, we ignored nouns, verbs and adjectives and replaced them by grammatical tags and bigrams. This resulted in slightly reduced performance. Author detection with SVMs on full word forms was remarkably robust even if the author wrote about different topics.},
}

@inProceedings{DiNunzio03,
  author    = {Giorgio M. {Di Nunzio} and Alessandro Micarelli},
  title     = {Does a New Simple {Gaussian} Weighting Approach Perform Well in Text Categorization?},
  booktitle = {Proceedings of IJCAI-03, 18th International Joint Conference on Artificial Intelligence},
  editor    = {Georg Gottlob and Toby Walsh},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  year      = {2003},
  address   = {Acapulco, MX},
  pages     = {581--586},
  url       = {},
  abstract  = {A new approach to the Text Categorization problem is here presented.
It is called Gaussian Weighting and it is a supervised learning algorithm that, during the training phase, estimates two very simple and easily computable statistics which are: the Presence \emph{P}, how much a term \emph{t} is present in a category \emph{c}; the Expressiveness \emph{E}, how much \emph{t} is present outside \emph{c} in the rest of the domain. Once the system has learned this information, a Gaussian function is shaped for each term of a category, in order to assign the term a weight that estimates the level of its importance for that particular category. We tested our learning method on the task of single-label classification using the Reuters-21578 benchmark. The outcome of the result was quite impressive: in different experimental setups, we reached a micro-averaged F1-measure of 0.89, with a peak of 0.899. Moreover, a macro-averaged Recall and Precision was calculated: the former reported a 0.72, the latter a 0.79. These results reach most of the state-of-the-art techniques of machine learning applied to Text Categorization, demonstrating that this new weighting scheme does perform well on this particular task.},
}

@inProceedings{DiNunzio04,
  author    = {Giorgio M. {Di Nunzio}},
  title     = {A Bidimensional View of Documents for Text Categorisation},
  booktitle = {Proceedings of ECIR-04, 26th European Conference on Information Retrieval Research},
  editor    = {Sharon McDonald and John Tait},
  year      = {2004},
  address   = {Sunderland, UK},
  publisher = {Springer Verlag, Heidelberg, DE},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2997},
  pages     = {112--126},
  url       = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2997&spage=112},
  abstract  = {The question addressed in this paper is to find a bidimensional representation of textual documents for the problem of text categorisation. The projection of documents is performed following subsequent steps.
The main idea is to consider a possible double aspect of the importance of a word: the local importance in a category, and the global importance in the rest of the categories. This information is combined properly and summarized in two coordinates. Then, a machine learning method may be used in this simple bidimensional space to classify the documents. The results that can be obtained in this space are satisfactory with respect to the best state-of-the-art performances.},
}

@inProceedings{Dorre99,
  author    = {Jochen D{\"o}rre and Peter Gerstl and Roland Seiffert},
  title     = {Text mining: finding nuggets in mountains of textual data},
  booktitle = {Proceedings of KDD-99, 5th ACM International Conference on Knowledge Discovery and Data Mining},
  publisher = {ACM Press, New York, US},
  editor    = {},
  year      = {1999},
  address   = {San Diego, US},
  pages     = {398--401},
  url       = {http://www.acm.org/pubs/articles/proceedings/ai/312129/p398-dorre/p398-dorre.pdf},
  abstract  = {Text mining applies the same analytical functions of data mining to the domain of textual information, relying on sophisticated text analysis techniques that distill information from free-text documents. IBM's Intelligent Miner for Text provides the necessary tools to unlock the business information that is ``trapped'' in email, insurance claims, news feeds, or other document repositories. It has been successfully applied in analyzing patent portfolios, customer complaint letters, and even competitors' Web pages. After defining our notion of ``text mining'', we focus on the differences between text and data mining and describe in some more detail the unique technologies that are key to successful text mining.},
}

@article{Doyle65,
  author   = {Lauren B.
Doyle},
  title    = {Is automatic classification a reasonable application of statistical analysis of text?},
  journal  = {Journal of the ACM},
  volume   = {12},
  number   = {4},
  year     = {1965},
  pages    = {473--489},
  url      = {http://www.acm.org/pubs/articles/journals/jacm/1965-12-4/p473-doyle/p473-doyle.pdf},
  abstract = {The statistical approach to the analysis of document collections and retrieval therefrom has proceeded along two main lines, associative machine searching and automatic classification. The former approach has been favored because of the tendency of people in the computer field to strive for new methods of dealing with the literature -- methods which do not resemble those of traditional libraries. But automatic classification study also has been thriving; some of the reasons for this are discussed. The crucial question of the quality of automatic classification is treated at considerable length, and empirical data are introduced to support the hypothesis that classification quality improves as more information about each document is used for input to the classification program. Six nonjudgmental criteria are used in testing the hypothesis for 100 keyword lists (each list representing a document) for a series of computer runs in which the number of words per document is increased progressively from 12 to 36. Four of the six criteria indicate the hypothesis holds, and two point to no effect. Previous work of this kind has been confined to the range of one through eight words per document.
Finally, the future of automatic classification and some of the practical problems to be faced are outlined.},
}

@article{Drucker99,
  author   = {Harris Drucker and Donghui Wu and Vladimir Vapnik},
  title    = {Support vector machines for spam categorization},
  journal  = {IEEE Transactions on Neural Networks},
  year     = {1999},
  number   = {5},
  volume   = {10},
  pages    = {1048--1054},
  url      = {http://www.monmouth.edu/~drucker/SVM_spam_article_compete.PDF},
  abstract = {We study the use of Support Vector Machines (SVMs) in classifying email as spam or nonspam by comparing it to three other classification algorithms: Ripper, Rocchio, and boosting decision trees. These four algorithms were tested on two different data sets: one data set where the number of features were constrained to the 1000 best features and another data set where the dimensionality was over 7000. SVMs performed best when using binary features. For both data sets, boosting trees and SVMs had acceptable test performance in terms of accuracy and speed. However, SVMs had significantly less training time.}
}

@inProceedings{Dumais:2000:HCW,
  author    = {Susan T. Dumais and Hao Chen},
  title     = {Hierarchical classification of {Web} content},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {ACM Press, New York, US},
  address   = {Athens, GR},
  year      = {2000},
  pages     = {256--263},
  url       = {http://research.microsoft.com/~sdumais/sigir00.pdf},
  abstract  = {This paper explores the use of hierarchical structure for classifying a large, heterogeneous collection of web content. The hierarchical structure is initially used to train different second-level classifiers. In the hierarchical case, a model is learned to distinguish a second-level category from other categories within the same top level.
In the flat non-hierarchical case, a model distinguishes a second-level category from all other second-level categories. Scoring rules can further take advantage of the hierarchy by considering only second-level categories that exceed a threshold at the top level. We use support vector machine (SVM) classifiers, which have been shown to be efficient and effective for classification, but not previously explored in the context of hierarchical classification. We found small advantages in accuracy for hierarchical models over flat models. For the hierarchical approach, we found the same accuracy using a sequential Boolean decision rule and a multiplicative decision rule. Since the sequential approach is much more efficient, requiring only 14\%-16\% of the comparisons used in the other approaches, we find it to be a good choice for classifying text into large hierarchical structures.}
}

@inProceedings{Dumais98,
  author = {Susan T. Dumais and John Platt and David Heckerman and Mehran Sahami},
  title = {Inductive learning algorithms and representations for text categorization},
  booktitle = {Proceedings of CIKM-98, 7th ACM International Conference on Information and Knowledge Management},
  editor = {Georges Gardarin and James C. French and Niki Pissinou and Kia Makki and Luc Bouganim},
  publisher = {ACM Press, New York, US},
  address = {Bethesda, US},
  year = {1998},
  pages = {148--155},
  url = {http://robotics.stanford.edu/users/sahami/papers-dir/cikm98.pdf},
  abstract = {Text categorization - the assignment of natural language texts to one or more predefined categories based on their content - is an important component in many information organization and management tasks. We compare the effectiveness of five different automatic learning algorithms for text categorization in terms of learning speed, real-time classification speed, and classification accuracy. We also examine training set size, and alternative document representations.
Very accurate text classifiers can be learned automatically from training examples. Linear Support Vector Machines (SVMs) are particularly promising because they are very accurate, quick to train, and quick to evaluate.}
}

@inProceedings{ElYaniv01,
  author = {Ran El-Yaniv and Oren Souroujon},
  title = {Iterative Double Clustering for Unsupervised and Semi-supervised Learning},
  booktitle = {Proceedings of ECML-01, 12th European Conference on Machine Learning},
  editor = {Luc De Raedt and Peter A. Flach},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Freiburg, DE},
  year = {2001},
  pages = {121--132},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 2167},
  url = {http://link.springer.de/link/service/series/0558/papers/2167/21670121.pdf},
  abstract = {This paper studies the Iterative Double Clustering (IDC) meta-clustering algorithm, a new extension of the recent Double Clustering (DC) method of Slonim and Tishby that exhibited impressive performance on text categorization tasks. Using synthetically generated data we empirically demonstrate that whenever the DC procedure is successful in recovering some of the structure hidden in the data, the extended IDC procedure can incrementally compute a dramatically better classification, with minor additional computational resources. We demonstrate that the IDC algorithm is especially advantageous when the data exhibits high attribute noise. Our simulation results also show the effectiveness of IDC in text categorization problems. Surprisingly, this unsupervised procedure can be competitive with a (supervised) SVM trained with a small training set.
Finally, we propose a natural extension of IDC for (semi-supervised) transductive learning where we are given both labeled and unlabeled examples, and present preliminary empirical results showing the plausibility of the extended method in a semi-supervised setting.},
}

% The first editor's surname is the compound "Lopez de Mantaras"; it is braced
% and written in "Last, First" form so BibTeX keeps it as a single surname.
@inProceedings{Escudero00,
  author = {Gerard Escudero and Llu{\'{\i}}s M{\`{a}}rquez and German Rigau},
  title = {Boosting applied to word sense disambiguation},
  booktitle = {Proceedings of ECML-00, 11th European Conference on Machine Learning},
  editor = {{L{\'{o}}pez de M{\'{a}}ntaras}, Ramon and Enric Plaza},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Barcelona, ES},
  year = {2000},
  pages = {129--141},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 1810},
  url = {http://www.lsi.upc.es/~escudero/recerca/ecml00.pdf},
  abstract = {In this paper Schapire and Singer's AdaBoost.MH boosting algorithm is applied to the Word Sense Disambiguation (WSD) problem. Initial experiments on a set of 15 selected polysemous words show that the boosting approach surpasses Naive Bayes and Exemplar-based approaches, which represent state-of-the-art accuracy on supervised WSD. In order to make boosting practical for a real learning domain of thousands of words, several ways of accelerating the algorithm by reducing the feature space are studied. The best variant, which we call LazyBoosting, is tested on the largest sense-tagged corpus available containing 192,800 examples of the 191 most frequent and ambiguous English words. Again, boosting compares favourably to the other benchmark algorithms.},
}

@article{Fall03,
  author = {C. J. Fall and A. T{\"o}rcsv{\'a}ri and K. Benzineb and G.
Karetka},
  title = {Automated Categorization in the {International Patent Classification}},
  journal = {SIGIR Forum},
  year = {2003},
  volume = {37},
  number = {1},
  pages = {10--25},
  url = {http://www.acm.org/sigir/forum/S2003/CJF_Manuscript_sigir.pdf},
  abstract = {A new reference collection of patent documents for training and testing automated categorization systems is established and described in detail. This collection is tailored for automating the attribution of international patent classification codes to patent applications and is made publicly available for future research work. We report the results of applying a variety of machine learning algorithms to the automated categorization of English-language patent documents. This procedure involves a complex hierarchical taxonomy, within which we classify documents into 114 classes and 451 subclasses. Several measures of categorization success are described and evaluated. We investigate how best to resolve the training problems related to the attribution of multiple classification codes to each patent document.},
}

@inProceedings{Fangmeyer68,
  author = {Hermann Fangmeyer and Gerhard Lustig},
  title = {The EURATOM automatic indexing project},
  booktitle = {Proceedings of the IFIP Congress (Booklet J)},
  editor = {},
  publisher = {},
  address = {Edinburgh, UK},
  year = {1968},
  pages = {66--70},
  url = {},
  abstract = {},
}

@inProceedings{Fangmeyer70,
  author = {Hermann Fangmeyer and Gerhard Lustig},
  title = {Experiments with the CETIS automated indexing system},
  booktitle = {Proceedings of the Symposium on the Handling of Nuclear Information},
  editor = {},
  publisher = {International Atomic Energy Agency},
  address = {},
  year = {1970},
  pages = {557--567},
  url = {},
  abstract = {},
}

@inProceedings{Ferilli01,
  author = {Stefano Ferilli and Nicola Fanizzi and Gianni Semeraro},
  title = {Learning logic models for automated text categorization},
  booktitle = {Proceedings of AI*IA-01, 7th Congress of the Italian Association for
Artificial Intelligence},
  editor = {Floriana Esposito},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Bari, IT},
  year = {2001},
  pages = {81--86},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 2175},
  url = {http://link.springer.de/link/service/series/0558/papers/2175/21750081.pdf},
  abstract = {This work addresses a logical approach to text categorization inside a framework aimed at full automatic paper document processing. The logic representation of sentences required by the adopted learning algorithm is obtained by detecting structure in raw text through a parser. A preliminary experimentation proved that the logic approach is able to capture the semantics underlying some kind of sentences, even if the assessment of the efficiency of such a method, as well as a comparison with other related approaches, has still to be carried out.},
}

@article{Field75,
  author = {B. J. Field},
  title = {Towards automatic indexing: automatic assignment of controlled-language indexing and classification from free indexing},
  journal = {Journal of Documentation},
  year = {1975},
  volume = {31},
  number = {4},
  pages = {246--265},
  url = {},
  abstract = {},
}

@inProceedings{Finn02,
  author = {Aidan Finn and Nicholas Kushmerick and Barry Smyth},
  title = {Genre Classification and Domain Transfer for Information Filtering},
  booktitle = {Proceedings of ECIR-02, 24th European Colloquium on Information Retrieval Research},
  editor = {Fabio Crestani and Mark Girolami and Van Rijsbergen, Cornelis J.},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Glasgow, UK},
  year = {2002},
  pages = {353--362},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 2291},
  url = {http://www.cs.ucd.ie/staff/nick/home/research/download/finn-ecir2002.ps.gz},
  abstract = {The World Wide Web is a vast repository of information, but the sheer volume makes it difficult to identify useful documents.
We identify document genre as an important factor in retrieving useful documents and focus on the novel document genre dimension of subjectivity. We investigate three approaches to automatically classifying documents by genre: traditional bag of words techniques, part-of-speech statistics, and hand-crafted shallow linguistic features. We are particularly interested in domain transfer: how well the learned classifiers generalize from the training corpus to a new document corpus. Our experiments demonstrate that the part-of-speech approach is better than traditional bag of words techniques, particularly in the domain transfer conditions.},
}

% The LNCS number in the note (2633) is the volume that appears in this entry's
% Springer URL; publisher and note follow the form used by the other LNCS entries.
@inProceedings{Fisher03,
  author = {Michelle Fisher and Richard Everson},
  title = {When are links useful? {Experiments} in text classification},
  booktitle = {Proceedings of ECIR-03, 25th European Conference on Information Retrieval},
  editor = {Fabrizio Sebastiani},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Pisa, IT},
  year = {2003},
  pages = {41--56},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 2633},
  url = {http://link.springer.de/link/service/series/0558/papers/2633/26330041.pdf},
  abstract = {Link analysis methods have become popular for information access tasks, especially information retrieval, where the link information in a document collection is used to complement the traditionally used content information. However, there has been little firm evidence to confirm the utility of link information. We show that link information can be useful when the document collection has a sufficiently high link density and links are of sufficiently high quality. We report experiments on text classification of the Cora and WebKB data sets using Probabilistic Latent Semantic Analysis and Probabilistic Hypertext Induced Topic Selection. Comparison with manually assigned classes shows that link information enhances classification in data with sufficiently high link density, but is detrimental to performance at low link densities or if the quality of the links is degraded.
We introduce a new frequency-based method for selecting the most useful citations from a document collection for use in the model.},
}

% NOTE(review): third editor's given name expanded from "H." to "Hannu" --
% confirm against the PKDD-02 proceedings front matter.
@inProceedings{Forman02,
  author = {George Forman},
  title = {Choose Your Words Carefully: An Empirical Study of Feature Selection Metrics for Text Classification},
  booktitle = {Proceedings of PKDD-02, 6th European Conference on Principles of Data Mining and Knowledge Discovery},
  editor = {Tapio Elomaa and Heikki Mannila and Hannu Toivonen},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Helsinki, FI},
  year = {2002},
  pages = {150--162},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 2431},
  url = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2431&spage=150},
  abstract = {Good feature selection is essential for text classification to make it tractable for machine learning, and to improve classification performance. This study benchmarks the performance of twelve feature selection metrics across 229 text classification problems drawn from Reuters, OHSUMED, TREC, etc. using Support Vector Machines. The results are analyzed for various objectives. For best accuracy, F-measure or recall, the findings reveal an outstanding new feature selection metric, ``Bi-Normal Separation'' (BNS). For precision alone, however, Information Gain (IG) was superior. A new evaluation methodology is offered that focuses on the needs of the data mining practitioner who seeks to choose one or two metrics to try that are most likely to have the best performance for the single dataset at hand.
This analysis determined, for example, that IG and Chi-Squared have correlated failures for precision, and that IG paired with BNS is a better choice.},
}

% month uses the standard three-letter BibTeX macro so styles can localise it.
@article{Forman03,
  author = {George Forman},
  title = {An Extensive Empirical Study of Feature Selection Metrics for Text Classification},
  journal = {Journal of Machine Learning Research},
  year = {2003},
  month = mar,
  volume = {3},
  pages = {1289--1305},
  url = {http://www.jmlr.org/papers/v3/forman03a.html},
  abstract = {Machine learning for text classification is the cornerstone of document categorization, news filtering, document routing, and personalization. In text domains, effective feature selection is essential to make the learning task efficient and more accurate. This paper presents an empirical comparison of twelve feature selection methods (e.g.\ Information Gain) evaluated on a benchmark of 229 text classification problem instances that were gathered from Reuters, TREC, OHSUMED, etc. The results are analyzed from multiple goal perspectives---accuracy, F-measure, precision, and recall---since each is appropriate in different situations. The results reveal that a new feature selection metric we call 'Bi-Normal Separation' (BNS), outperformed the others by a substantial margin in most situations. This margin widened in tasks with high class skew, which is rampant in text classification problems and is particularly challenging for induction algorithms. A new evaluation methodology is offered that focuses on the needs of the data mining practitioner faced with a single dataset who seeks to choose one (or a pair of) metrics that are most likely to yield the best performance. From this perspective, BNS was the top single choice for all goals except precision, for which Information Gain yielded the best result most often. This analysis also revealed, for example, that Information Gain and Chi-Squared have correlated failures, and so they work poorly together.
When choosing optimal pairs of metrics for each of the four performance goals, BNS is consistently a member of the pair---e.g., for greatest recall, the pair BNS + F1-measure yielded the best performance on the greatest number of tasks by a considerable margin.},
}

% The doi field repeats the bare DOI embedded in the url field of this entry.
@inProceedings{Forman04,
  author = {George Forman},
  title = {A pitfall and solution in multi-class feature selection for text classification},
  booktitle = {Proceedings of ICML-04, 21st International Conference on Machine Learning},
  editor = {Carla E. Brodley},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  address = {Banff, CA},
  year = {2004},
  pages = {},
  doi = {10.1145/1015330.1015356},
  url = {http://doi.acm.org/10.1145/1015330.1015356},
  abstract = {Information Gain is a well-known and empirically proven method for high-dimensional feature selection. We found that it and other existing methods failed to produce good results on an industrial text classification problem. On investigating the root cause, we find that a large class of feature scoring methods suffers a pitfall: they can be blinded by a surplus of strongly predictive features for some classes, while largely ignoring features needed to discriminate difficult classes. In this paper we demonstrate this pitfall hurts performance even for a relatively uniform text classification task. Based on this understanding, we present solutions inspired by round-robin scheduling that avoid this pitfall, without resorting to costly wrapper methods.
Empirical evaluation on 19 datasets shows substantial improvements.},
}

@inProceedings{Forman04a,
  author = {George Forman and Ira Cohen},
  title = {Learning from little: comparison of classifiers given little training},
  booktitle = {Proceedings of PKDD-04, 8th European Conference on Principles of Data Mining and Knowledge Discovery},
  editor = {Jean-Fran{\c{c}}ois Boulicaut and Floriana Esposito and Fosca Giannotti and Dino Pedreschi},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Pisa, IT},
  year = {2004},
  pages = {161--172},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 3202},
  url = {},
  abstract = {},
}

@inCollection{Forsyth99,
  author = {Richard S. Forsyth},
  title = {New directions in text categorization},
  booktitle = {Causal models and intelligent data management},
  editor = {Alex Gammerman},
  publisher = {Springer Verlag},
  address = {Heidelberg, DE},
  year = {1999},
  pages = {151--185},
  url = {},
  abstract = {},
}

@inProceedings{Frank00,
  author = {Eibe Frank and Chang Chui and Ian H. Witten},
  title = {Text Categorization Using Compression Models},
  booktitle = {Proceedings of DCC-00, IEEE Data Compression Conference},
  editor = {Storer, James A. and Cohn, Martin},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  address = {Snowbird, US},
  year = {2000},
  pages = {200--209},
  url = {http://dlib.computer.org/conferen/dcc/0592/pdf/05920555.pdf},
  abstract = {Text categorization is the assignment of natural language texts to predefined categories based on their content. It has often been observed that compression seems to provide a very promising approach to categorization. The overall compression of an article with respect to different models can be compared to see which one it fits most closely. Such a scheme has several potential advantages because it does not require any pre-processing of the input text.
We have performed extensive experiments on the use of PPM compression models for categorization using the standard Reuters-21578 dataset. We obtained some encouraging results on two-category situations, and the results on the general problem seem reasonably impressive---in one case outstanding. However, we find that PPM does not compete with the published state of the art in the use of machine learning for text categorization. It produces inferior results because it is insensitive to subtle differences between articles that belong to a category and those that do not. We do not believe our results are specific to PPM. If the occurrence of a single word determines whether an article belongs to a category or not (and it often does) any compression scheme will likely fail to classify the article correctly. Machine learning schemes fare better because they automatically eliminate irrelevant features and concentrate on the most discriminating ones.},
}

@inProceedings{Frasconi01,
  author = {Paolo Frasconi and Giovanni Soda and Alessandro Vullo},
  title = {Text Categorization for Multi-page Documents: A Hybrid Naive {Bayes HMM} Approach},
  booktitle = {Proceedings of JCDL, 1st ACM-IEEE Joint Conference on Digital Libraries},
  editor = {},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  address = {Roanoke, US},
  year = {2001},
  pages = {11--20},
  url = {http://www.dsi.unifi.it/~paolo/ps/jcdl01-hmm-text.pdf},
  abstract = {Text categorization is typically formulated as a concept learning problem where each instance is a single isolated document. In this paper we are interested in a more general formulation where documents are organized as page sequences, as naturally occurring in digital libraries of scanned books and magazines.
We describe a method for classifying pages of sequential OCR text documents into one of several assigned categories and suggest that taking into account contextual information provided by the whole page sequence can significantly improve classification accuracy. The proposed architecture relies on hidden Markov models whose emissions are bag-of-words according to a multinomial word event model, as in the generative portion of the Naive Bayes classifier. Our results on a collection of scanned journals from the Making of America project confirm the importance of using whole page sequences. Empirical evaluation indicates that the error rate (as obtained by running a plain Naive Bayes classifier on isolated page) can be roughly reduced by half if contextual information is incorporated.},
}

% NOTE(review): title is that of the published JIIS article, which differs from
% the conference version (Frasconi01) -- confirm against the journal record.
@article{Frasconi02,
  author = {Paolo Frasconi and Giovanni Soda and Alessandro Vullo},
  title = {Hidden {Markov} Models for Text Categorization in Multi-Page Documents},
  journal = {Journal of Intelligent Information Systems},
  year = {2002},
  volume = {18},
  number = {2/3},
  pages = {195--217},
  note = {Special Issue on Automated Text Categorization},
  url = {http://www.wkap.nl/article.pdf?391247},
  abstract = {In the traditional setting, text categorization is formulated as a concept learning problem where each instance is a single isolated document. However, this perspective is not appropriate in the case of many digital libraries that offer as contents scanned and optically read books or magazines. In this paper, we propose a more general formulation of text categorization, allowing documents to be organized as \textit{sequences} of pages. We introduce a novel hybrid system specifically designed for multi-page text documents. The architecture relies on hidden Markov models whose emissions are bag-of-words resulting from a multinomial word event model, as in the generative portion of the Naive Bayes classifier.
The rationale behind our proposal is that taking into account contextual information provided by the whole page sequence can help disambiguation and improves single page classification accuracy. Our results on two datasets of scanned journals from the Making of America collection confirm the importance of using whole page sequences. The empirical evaluation indicates that the error rate (as obtained by running the Naive Bayes classifier on isolated pages) can be significantly reduced if contextual information is incorporated.},
}

@inProceedings{Frommholz01,
  author = {Ingo Frommholz},
  title = {Categorizing Web Documents in Hierarchical Catalogues},
  booktitle = {Proceedings of ECIR-01, 23rd European Colloquium on Information Retrieval Research},
  editor = {},
  publisher = {},
  address = {Darmstadt, DE},
  year = {2001},
  pages = {},
  url = {http://ls6-www.informatik.uni-dortmund.de/bib/fulltext/ir/Frommholz:01a.pdf},
  abstract = {Automatic categorization of web documents (e.g. HTML documents) denotes the task of automatically finding relevant categories for a (new) document which is to be inserted into a web catalogue like Yahoo!. There exist many approaches for performing this difficult task. Here, special kinds of web catalogues, those whose category scheme is hierarchically ordered, are regarded. A method for using the knowledge about the hierarchy to gain better categorization results is discussed.
This method can be applied in a post-processing step and therefore be combined with other known (non-hierarchical) categorization approaches.},
}

@inProceedings{Fuhr84,
  author = {Fuhr, Norbert and Knorz, Gerhard},
  title = {Retrieval test evaluation of a rule-based automated indexing {(AIR/PHYS)}},
  booktitle = {Proceedings of SIGIR-84, 7th ACM International Conference on Research and Development in Information Retrieval},
  editor = {Van Rijsbergen, Cornelis J.},
  publisher = {Cambridge University Press},
  address = {Cambridge, UK},
  year = {1984},
  pages = {391--408},
  url = {},
  abstract = {},
}

@inProceedings{Fuhr85,
  author = {Fuhr, Norbert},
  title = {A probabilistic model of dictionary-based automatic indexing},
  booktitle = {Proceedings of RIAO-85, 1st International Conference ``Recherche d'Information Assistee par Ordinateur''},
  editor = {},
  publisher = {},
  address = {Grenoble, FR},
  year = {1985},
  pages = {207--216},
  url = {},
  abstract = {},
}

@inProceedings{Fuhr91a,
  author = {Fuhr, Norbert and Hartmann, Stephan and Knorz, Gerhard and Lustig, Gerhard and Schwantner, Michael and Tzeras, Konstadinos},
  title = {{AIR/X} -- a Rule-Based Multistage Indexing System for Large Subject Fields},
  booktitle = {Proceedings of RIAO-91, 3rd International Conference ``Recherche d'Information Assistee par Ordinateur''},
  editor = {Andr{\'e} Lichnerowicz},
  publisher = {Elsevier Science Publishers, Amsterdam, NL},
  address = {Barcelona, ES},
  year = {1991},
  pages = {606--623},
  url = {http://www.darmstadt.gmd.de/~tzeras/FullPapers/gz/Fuhr-etal-91.ps.gz},
  abstract = {AIR/X is a rule-based system for indexing with terms (descriptors) from a prescribed vocabulary. For this task, an indexing dictionary with rules for mapping terms from the text onto descriptors is required, which can be derived automatically from a set of manually indexed documents. Based on the Darmstadt Indexing Approach, the indexing task is divided into a description step and a decision step.
First, terms (single words or phrases) are identified in the document text. With term-descriptor rules from the dictionary, descriptor indications are formed. The set of all indications from a document leading to the same descriptor is called a relevance description. A probabilistic classification procedure computes indexing weights for each relevance description. Since the whole system is rule-based, it can be adapted to different subject fields by appropriate modifications of the rule bases. A major application of AIR/X is the AIR/PHYS system developed for a large physics database. This application is described in more detail along with experimental results.},
}

@inProceedings{Fuhr91b,
  author = {Norbert Fuhr and Ulrich Pfeifer},
  title = {Combining Model-Oriented and Description-Oriented Approaches for Probabilistic Indexing},
  booktitle = {Proceedings of SIGIR-91, 14th ACM International Conference on Research and Development in Information Retrieval},
  editor = {Abraham Bookstein and Yves Chiaramella and Gerard Salton and Vijay V. Raghavan},
  publisher = {ACM Press, New York, US},
  address = {Chicago, US},
  year = {1991},
  pages = {46--56},
  note = {An extended version appears as~\cite{Fuhr94}},
  url = {http://www.acm.org/pubs/articles/proceedings/ir/122860/p46-fuhr/p46-fuhr.pdf},
  abstract = {We distinguish model-oriented and description-oriented approaches in probabilistic information retrieval. The former refer to certain representations of documents and queries and use additional independence assumptions, whereas the latter map documents and queries onto feature vectors which form the input to certain classification procedures or regression methods. Description-oriented approaches are more flexible with respect to the underlying representations, but the definition of the feature vector is a heuristic step. In this paper, we combine a probabilistic model for the Darmstadt Indexing Approach with logistic regression.
Here the probabilistic model forms a guideline for the definition of the feature vector. Experiments with the purely theoretical approach and with several heuristic variations show that heuristic assumptions may yield significant improvements.},
}

% NOTE(review): title punctuation restored so that it lists the three concepts
% named in the abstract -- confirm against the ACM TOIS record.
@article{Fuhr94,
  author = {Norbert Fuhr and Ulrich Pfeifer},
  title = {Probabilistic Information Retrieval as a Combination of Abstraction, Inductive Learning, and Probabilistic Assumptions},
  journal = {ACM Transactions on Information Systems},
  year = {1994},
  volume = {12},
  number = {1},
  pages = {92--115},
  url = {http://ls6-www.informatik.uni-dortmund.de/bib/fulltext/ir/Fuhr_Pfeifer:94.ps.gz},
  abstract = {We show that former approaches in probabilistic information retrieval are based on one or two of the three concepts abstraction, inductive learning and probabilistic assumptions, and we propose a new approach which combines all three concepts. This approach is illustrated for the case of indexing with a controlled vocabulary. For this purpose, we describe a new probabilistic model first, which is then combined with logistic regression, thus yielding a generalization of the original model. Experimental results for the pure theoretical model as well as for heuristic variants are given. Furthermore, linear and logistic regression are compared.},
}

@article{Furnkranz02,
  author = {Johannes F{\"{u}}rnkranz},
  title = {Hyperlink Ensembles: A Case Study in Hypertext Classification},
  journal = {Information Fusion},
  year = {2002},
  volume = {3},
  number = {4},
  pages = {299--312},
  url = {},
  abstract = {In this paper, we introduce hyperlink ensembles, a novel type of ensemble classifier for classifying hypertext documents. Instead of using the text on a page for deriving features that can be used for training a classifier, we suggest to use portions of texts from all pages that point to the target page. A hyperlink ensemble is formed by obtaining one prediction for each hyperlink that points to a page.
These individual predictions for each hyperlink are subsequently combined to a final prediction for the class of the target page. We explore four different ways of combining the individual predictions and four different techniques for identifying relevant text portions. The utility of our approach is demonstrated on a set of Web-pages that relate to Computer Science Departments.},
}

@inProceedings{Furnkranz99,
  author = {Johannes F{\"{u}}rnkranz},
  title = {Exploiting Structural Information for Text Classification on the WWW},
  booktitle = {Proceedings of IDA-99, 3rd Symposium on Intelligent Data Analysis},
  editor = {David J. Hand and Joost N. Kok and Michael R. Berthold},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Amsterdam, NL},
  year = {1999},
  pages = {487--497},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 1642},
  url = {http://www.ai.univie.ac.at/~juffi/publications/ida-99.ps.gz},
  abstract = {In this paper, we report on a set of experiments that explore the utility of making use of the structural information of WWW documents. Our working hypothesis is that it is often easier to classify a hypertext page using information provided on pages that point to it instead of using information that is provided on the page itself.
We present experimental evidence that confirms this hypothesis on a set of Web pages that relate to computer science departments.}
}

% Field values are brace-delimited and address/publisher use the "City, CC"
% form, as in the neighbouring entries; month uses the standard BibTeX macro.
@inProceedings{Gabrilovich:2004:NewsJunkie,
  author = {Gabrilovich, Evgeniy and Dumais, Susan and Horvitz, Eric},
  title = {Newsjunkie: {Providing} Personalized Newsfeeds via Analysis of Information Novelty},
  booktitle = {Proceedings of the Thirteenth International World Wide Web Conference ({WWW2004})},
  publisher = {ACM Press, New York, US},
  address = {New York, US},
  year = {2004},
  month = may,
  pages = {482--490},
  url = {http://www.cs.technion.ac.il/~gabr/papers/NewsJunkie.pdf},
  abstract = {We present a principled methodology for filtering news stories by formal measures of information novelty, and show how the techniques can be used to custom-tailor newsfeeds based on information that a user has already reviewed. We review methods for analyzing novelty and then describe NewsJunkie, a system that personalizes news for users by identifying the novelty of stories in the context of stories they have already reviewed. NewsJunkie employs novelty-analysis algorithms that represent articles as words and named entities. The algorithms analyze inter- and intra- document dynamics by considering how information evolves over time from article to article, as well as within individual articles. We review the results of a user study undertaken to gauge the value of the approach over legacy time-based review of newsfeeds, and also to compare the performance of alternate distance metrics that are used to estimate the dissimilarity between candidate new articles and sets of previously reviewed articles.},
}

@inProceedings{Gabrilovich04,
  author = {Evgeniy Gabrilovich and Shaul Markovitch},
  title = {Text categorization with many redundant features: using aggressive feature selection to make {SVM}s competitive with {C4.5}},
  booktitle = {Proceedings of ICML-04, 21st International Conference on Machine Learning},
  editor = {Carla E.
Brodley},
  year      = {2004},
  address   = {Banff, CA},
  pages     = {},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url       = {http://www.cs.technion.ac.il/~gabr/papers/fs-svm.pdf},
  abstract  = {Text categorization algorithms usually represent documents as bags of words and consequently have to deal with huge numbers of features. Most previous studies found that the majority of these features are relevant for classification, and that the performance of text categorization with support vector machines peaks when no feature selection is performed. We describe a class of text categorization problems that are characterized with many redundant features. Even though most of these features are relevant, the underlying concepts can be concisely captured using only a few features, while keeping all of them has substantially detrimental effect on categorization accuracy. We develop a novel measure that captures feature redundancy, and use it to analyze a large collection of datasets. We show that for problems plagued with numerous redundant features the performance of C4.5 is significantly superior to that of SVM, while aggressive feature selection allows SVM to beat C4.5 by a narrow margin.},
}

% Galavotti00: ECDL-00 conference paper (LNCS 1923).
@inProceedings{Galavotti00,
  author    = {Luigi Galavotti and Fabrizio Sebastiani and Maria Simi},
  title     = {Experiments on the use of feature selection and negative evidence in automated text categorization},
  booktitle = {Proceedings of ECDL-00, 4th European Conference on Research and Advanced Technology for Digital Libraries},
  editor    = {Jos{\'e} L. Borbinha and Thomas Baker},
  publisher = {Springer Verlag, Heidelberg, DE},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 1923},
  year      = {2000},
  address   = {Lisbon, PT},
  pages     = {59--68},
  url       = {http://www.math.unipd.it/~fabseb60/Publications/ECDL00.pdf},
  abstract  = {We tackle two different problems of {\em text categorization} (TC), namely feature selection and classifier induction.
{\em Feature selection} (FS) refers to the activity of selecting, from the set of $r$ distinct features (i.e.\ words) occurring in the collection, the subset of $r'\ll r$ features that are most useful for compactly representing the meaning of the documents. We propose a novel FS technique, based on a simplified variant of the $\chi^2$ statistics. {\em Classifier induction} refers instead to the problem of automatically building a text classifier by learning from a set of documents pre-classified under the categories of interest. We propose a novel variant, based on the exploitation of negative evidence, of the well-known $k$-NN method. We report the results of systematic experimentation of these two methods performed on the standard {\sc Reuters-21578} benchmark.}, }

% Gale93: journal article; the abstract field is an empty placeholder.
@article{Gale93,
  author    = {William A. Gale and Kenneth W. Church and David Yarowsky},
  title     = {A method for disambiguating word senses in a large corpus},
  journal   = {Computers and the Humanities},
  year      = {1993},
  number    = {5},
  volume    = {26},
  pages     = {415--439},
  url       = {http://www.research.att.com/~kwc/published_1993_sense.ps},
  abstract  = {},
}

% Gao03: SIGIR-03 conference paper.
@inProceedings{Gao03,
  author    = {Sheng Gao and Wen Wu and Chin-Hui Lee and Tat-Seng Chua},
  title     = {A maximal figure-of-merit learning approach to text categorization},
  booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Jamie Callan and Gordon Cormack and Charles Clarke and David Hawking and Alan Smeaton},
  publisher = {ACM Press, New York, US},
  address   = {Toronto, CA},
  year      = {2003},
  pages     = {174--181},
  url       = {http://doi.acm.org/10.1145/860435.860469},
  abstract  = {A novel maximal figure-of-merit (MFoM) learning approach to text categorization is proposed. Different from the conventional techniques, the proposed MFoM method attempts to integrate any performance metric of interest (e.g. accuracy, recall, precision, or F1 measure) into the design of any classifier.
The corresponding classifier parameters are learned by optimizing an overall objective function of interest. To solve this highly nonlinear optimization problem, we use a generalized probabilistic descent algorithm. The MFoM learning framework is evaluated on the Reuters-21578 task with LSI-based feature extraction and a binary tree classifier. Experimental results indicate that the MFoM classifier gives improved F1 and enhanced robustness over the conventional one. It also outperforms the popular SVM method in micro-averaging F1. Other extensions to design discriminative multiple-category MFoM classifiers for application scenarios with new performance metrics could be envisioned too.}, }

% Gaussier02: ECIR-02 conference paper (LNCS 2291).
@inProceedings{Gaussier02,
  author    = {{\'E}ric Gaussier and Cyril Goutte and Kris Popat and Francine Chen},
  title     = {A hierarchical model for clustering and categorising documents},
  booktitle = {Proceedings of ECIR-02, 24th European Colloquium on Information Retrieval Research},
  editor    = {Fabio Crestani and Mark Girolami and Van Rijsbergen, Cornelis J.},
  year      = {2002},
  address   = {Glasgow, UK},
  publisher = {Springer Verlag, Heidelberg, DE},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2291},
  pages     = {229--247},
  url       = {http://link.springer.de/link/service/series/0558/papers/2291/22910229.pdf},
  abstract  = {We propose a new hierarchical generative model for textual data, where words may be generated by topic specific distributions at any level in the hierarchy. This model is naturally well-suited to clustering documents in preset or automatically generated hierarchies, as well as categorising new documents in an existing hierarchy. Training algorithms are derived for both cases, and illustrated on real data by clustering news stories and categorising newsgroup messages. Finally, the generative model may be used to derive a Fisher kernel expressing similarity between documents.},
}

% Gentili01: journal article. The first author's initials are written with a
% space between them so that each one is parsed as a separate name token.
@article{Gentili01,
  author    = {G. L.
Gentili and Mauro Marinilli and Alessandro Micarelli and Filippo Sciarrone},
  title     = {Text categorization in an intelligent agent for filtering information on the Web},
  journal   = {International Journal of Pattern Recognition and Artificial Intelligence},
  pages     = {527--549},
  year      = {2001},
  number    = {3},
  volume    = {15},
  url       = {http://www.worldscinet.com/journals/ijprai/15/preserved-docs/1503/S021800140100099X.pdf},
  abstract  = {This paper presents a text categorization system, capable of analyzing HTML/text documents collected from the Web. The system is a component of a more extensive intelligent agent for adaptive information filtering on the Web. It is based on a hybrid case-based architecture, where two multilayer perceptrons are integrated into a case-based reasoner. An empirical evaluation of the system was performed by means of a confidence interval technique. The experimental results obtained are encouraging and support the choice of a hybrid case-based approach to text categorization.},
}

% Geutner93: WCNN-93 conference paper; publisher and editor are empty placeholders.
@inProceedings{Geutner93,
  author    = {Petra Geutner and Uli Bodenhausen and Alex Waibel},
  title     = {Flexibility Through Incremental Learning: Neural Networks for Text Categorization},
  booktitle = {Proceedings of WCNN-93, World Congress on Neural Networks},
  publisher = {},
  editor    = {},
  year      = {1993},
  address   = {Portland, US},
  pages     = {24--27},
  url       = {http://werner.ira.uka.de/papers/speech/1993/WCNN_93_petra_geutner.ps.gz},
  abstract  = {In this paper we show an adaptive incremental learning algorithm that learns interactively to classify text messages (here: emails) into categories without the need for lengthy batch training runs. The algorithm was evaluated on a large database of email messages that fall into five subjective categories. As control experiment best human categorization performance was established at 79.4\% for this task. The best of all connectionist architectures presented here achieves near human performance (79.1\%).
This architecture acquires its language model and dictionary adaptively and hence avoids handcoding of either. The learning algorithm combines an adaptive phase which instantly updates dictionary and weights during interaction and a tuning phase which fine tunes for performance using previously seen data. Such systems can be deployed in various applications where instantaneous interactive learning is necessary such as on-line email or news categorization, text summarization and information filtering in general.}, }

% Ghani00: ICML-00 conference paper.
@inProceedings{Ghani00,
  author    = {Rayid Ghani},
  title     = {Using error-correcting codes for text classification},
  booktitle = {Proceedings of ICML-00, 17th International Conference on Machine Learning},
  editor    = {Pat Langley},
  year      = {2000},
  address   = {Stanford, US},
  pages     = {303--310},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url       = {http://www.cs.cmu.edu/~rayid/mypapers/ecoc-icml.ps},
  abstract  = {This paper explores in detail the use of Error Correcting Output Coding (ECOC) for learning text classifiers. We show that the accuracy of a Naive Bayes Classifier over text classification tasks can be significantly improved by taking advantage of the error-correcting properties of the code. We also explore the use of different kinds of codes, namely Error-Correcting Codes, Random Codes, and Domain and Data-specific codes and give experimental results for each of them. The ECOC method scales well to large data sets with a large number of classes. Experiments on a real-world data set show a reduction in classification error by up to 66\% over the traditional Naive Bayes Classifier.
We also compare our empirical results to semi-theoretical results and find that the two closely agree.}, }

% Ghani01: ICML-01 conference paper.
@inProceedings{Ghani01,
  author    = {Rayid Ghani and Se{\'{a}}n Slattery and Yiming Yang},
  title     = {Hypertext Categorization using Hyperlink Patterns and Meta Data},
  booktitle = {Proceedings of ICML-01, 18th International Conference on Machine Learning},
  editor    = {Carla Brodley and Andrea Danyluk},
  address   = {Williams College, US},
  year      = {2001},
  pages     = {178--185},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url       = {http://www.cs.cmu.edu/~yiming/papers.yy/hypertext-icml01.ps.gz},
  abstract  = {Hypertext poses new text classification research challenges as hyperlinks, content of linked documents, and meta data about related web sites all provide richer sources of information for hypertext classification that are not available in traditional text classification. We investigate the use of such information for representing web sites, and the effectiveness of different classifiers (Naive Bayes, Nearest Neighbor, and {\sc Foil}) in exploiting those representations. We find that using words in web pages alone often yields suboptimal performance of classifiers, compared to exploiting additional sources of information beyond document content. On the other hand, we also observe that linked pages can be more harmful than helpful when the linked neighborhoods are highly ``noisy'' and that links have to be used in a careful manner. More importantly, our investigation suggests that meta data which is often available, or can be acquired using Information Extraction techniques, can be extremely useful for improving classification accuracy.
Finally, the relative performance of the different classifiers being tested gives us insights into the strengths and limitations of our algorithms for hypertext classification.}, }

% Ghani01a: IEEE International Conference on Data Mining paper.
@inProceedings{Ghani01a,
  author    = {Rayid Ghani},
  title     = {Combining Labeled and Unlabeled data for Text Classification with a Large Number of Categories},
  booktitle = {Proceedings of the IEEE International Conference on Data Mining},
  editor    = {Nick Cercone and Tsau Young Lin and Xindong Wu},
  address   = {San Jose, US},
  year      = {2001},
  pages     = {597--598},
  publisher = {IEEE Computer Society, Los Alamitos, US},
  url       = {http://www.cs.cmu.edu/~rayid/mypapers/icdm01.ps},
  abstract  = {We develop a framework to incorporate unlabeled data in the Error-Correcting Output Coding (ECOC) setup by decomposing multiclass problems into multiple binary problems and then use Co-Training to learn the individual binary classification problems. We show that our method is especially useful for classification tasks involving a large number of categories where Co-training doesn't perform very well by itself and when combined with ECOC, outperforms several other algorithms that combine labeled and unlabeled data for text classification in terms of accuracy, precision-recall tradeoff, and efficiency.},
}

% Ghani02: ICML-02 conference paper; editor and pages are empty placeholders.
@inProceedings{Ghani02,
  author    = {Rayid Ghani},
  title     = {Combining Labeled and Unlabeled Data for MultiClass Text Categorization},
  booktitle = {Proceedings of ICML-02, 19th International Conference on Machine Learning},
  editor    = {},
  year      = {2002},
  address   = {Sydney, AU},
  pages     = {},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url       = {http://www.accenture.com/xdoc/en/services/technology/publications/Ghani-ICML02.pdf},
  abstract  = {Supervised learning techniques for text classification often require a large number of labeled examples to learn accurately.
One way to reduce the amount of labeled data required is to develop algorithms that can learn effectively from a small number of labeled examples augmented with a large number of unlabeled examples. Current text learning techniques for combining labeled and unlabeled, such as EM and Co-Training, are mostly applicable for classification tasks with a small number of classes and do not scale up well for large multiclass problems. In this paper, we develop a framework to incorporate unlabeled data in the Error-Correcting Output Coding (ECOC) setup by first decomposing multiclass problems into multiple binary problems and then using Co-Training to learn the individual binary classification problems. We show that our method is especially useful for text classification tasks involving a large number of categories and outperforms other semi-supervised learning techniques such as EM and Co-Training. In addition to being highly accurate, this method utilizes the hamming distance from ECOC to provide high-precision results. We also present results with algorithms other than co-training in this framework and show that co-training is uniquely suited to work well within ECOC.}, }

% Giorgetti03: SAC-03 conference paper.
@inProceedings{Giorgetti03,
  author    = {Daniela Giorgetti and Fabrizio Sebastiani},
  title     = {Multiclass Text Categorization for Automated Survey Coding},
  year      = {2003},
  address   = {Melbourne, US},
  booktitle = {Proceedings of SAC-03, 18th ACM Symposium on Applied Computing},
  publisher = {ACM Press, New York, US},
  pages     = {798--802},
  url       = {http://www.math.unipd.it/~fabseb60/Publications/SAC03a.pdf},
  abstract  = {\emph{Survey coding} is the task of assigning a symbolic code from a predefined set of such codes to the answer given in response to an open-ended question in a questionnaire (aka \emph{survey}).
We formulate the problem of automated survey coding as a \emph{text categorization} problem, i.e.\ as the problem of learning, by means of supervised machine learning techniques, a model of the association between answers and codes from a training set of pre-coded answers, and applying the resulting model to the classification of new answers. In this paper we experiment with two different learning techniques, one based on na\"{\i}ve Bayesian classification and the other one based on multiclass support vector machines, and test the resulting framework on a corpus of social surveys. The results we have obtained significantly outperform the results achieved by previous automated survey coding approaches.}, }

% Giorgetti03a: journal article (JASIST 54:12).
@article{Giorgetti03a,
  author    = {Daniela Giorgetti and Fabrizio Sebastiani},
  title     = {Automating Survey Coding by Multiclass Text Categorization Techniques},
  journal   = {Journal of the American Society for Information Science and Technology},
  year      = {2003},
  volume    = {54},
  number    = {12},
  pages     = {1269--1277},
  url       = {http://www.math.unipd.it/~fabseb60/Publications/JASIST03.pdf},
  abstract  = {\emph{Survey coding} is the task of assigning a symbolic code from a predefined set of such codes to the answer given in response to an open-ended question in a questionnaire (aka \emph{survey}). This task is usually carried out in order to group respondents according to a predefined scheme based on their answers. Survey coding has several applications, especially in the social sciences, ranging from the simple classification of respondents to the extraction of statistics on political opinions, health and lifestyle habits, customer satisfaction, brand fidelity, and patient satisfaction. Survey coding is a difficult task, since the code that should be attributed to a respondent based on the answer she has given is a matter of subjective judgment, and thus requires expertise. It is thus unsurprising that this task has traditionally been performed manually, by trained coders.
Some attempts have been made at automating this task, most of them based on detecting the similarity between the answer and textual descriptions of the meanings of the candidate codes. We take a radically new stand, and formulate the problem of automated survey coding as a \emph{text categorization} problem, i.e.\ as the problem of learning, by means of supervised machine learning techniques, a model of the association between answers and codes from a training set of pre-coded answers, and applying the resulting model to the classification of new answers. In this paper we experiment with two different learning techniques, one based on na\"{\i}ve Bayesian classification and the other one based on multiclass support vector machines, and test the resulting framework on a corpus of social surveys. The results we have obtained significantly outperform the results achieved by previous automated survey coding approaches.}, }

% Glover02: WWW-02 conference paper.
@inProceedings{Glover02,
  author    = {Eric J. Glover and Kostas Tsioutsiouliklis and Steve Lawrence and David M. Pennock and Gary W. Flake},
  title     = {Using Web structure for classifying and describing Web pages},
  booktitle = {Proceedings of WWW-02, International Conference on the World Wide Web},
  address   = {Honolulu, US},
  year      = {2002},
  pages     = {562--569},
  publisher = {ACM Press, New York, US},
  url       = {http://www.cs.princeton.edu/~kt/www02.ps},
  abstract  = {The structure of the web is increasingly being used to improve organization, search, and analysis of information on the web. For example, Google uses the text in citing documents (documents that link to the target document) for search. We analyze the relative utility of document text, and the text in citing documents near the citation, for classification and description. Results show that the text in citing documents, when available, often has greater discriminative and descriptive power than the text in the target document itself.
The combination of evidence from a document and citing documents can improve on either information source alone. Moreover, by ranking words and phrases in the citing documents according to expected entropy loss, we are able to accurately name clusters of web pages, even with very few positive examples. Our results confirm, quantify, and extend previous research using web structure in these areas, introducing new methods for classification and description of pages.}, }

% Godbole04: PKDD-04 conference paper (LNCS 3202); url and abstract are empty placeholders.
@inProceedings{Godbole04,
  author    = {Shantanu Godbole and Abhay Harpale and Sunita Sarawagi and Soumen Chakrabarti},
  title     = {Document classification through interactive supervision of document and term labels},
  booktitle = {Proceedings of PKDD-04, 8th European Conference on Principles of Data Mining and Knowledge Discovery},
  editor    = {Jean-Fran{\c{c}}ois Boulicaut and Floriana Esposito and Fosca Giannotti and Dino Pedreschi},
  address   = {Pisa, IT},
  pages     = {185--196},
  year      = {2004},
  publisher = {Springer Verlag, Heidelberg, DE},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 3202},
  url       = {},
  abstract  = {},
}

% Goevert99: CIKM-99 conference paper; editor is an empty placeholder.
@inProceedings{Goevert99,
  author    = {Norbert G{\"{o}}vert and Mounia Lalmas and Norbert Fuhr},
  title     = {A probabilistic description-oriented approach for categorising Web documents},
  booktitle = {Proceedings of CIKM-99, 8th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor    = {},
  year      = {1999},
  address   = {Kansas City, US},
  pages     = {475--482},
  url       = {http://ls6-www.informatik.uni-dortmund.de/ir/publications/1999/Goevert_etal:99.html},
  abstract  = {The automatic categorisation of web documents is becoming crucial for organising the huge amount of information available in the Internet. We are facing a new challenge due to the fact that web documents have a rich structure and are highly heterogeneous.
Two ways to respond to this challenge are (1) using a representation of the content of web documents that captures these two characteristics and (2) using more effective classifiers. Our categorisation approach is based on a probabilistic description-oriented representation of web documents, and a probabilistic interpretation of the k-nearest neighbour classifier. With the former, we provide an enhanced document representation that incorporates the structural and heterogeneous nature of web documents. With the latter, we provide a theoretical sound justification for the various parameters of the k-nearest neighbour classifier. Experimental results show that (1) using an enhanced representation of web documents is crucial for an effective categorisation of web documents, and (2) a theoretical interpretation of the k-nearest neighbour classifier gives us improvement over the standard k-nearest neighbour classifier.}, }

% Goldberg95: ICTAI-95 conference paper; its note cross-cites Goldberg96 as the extended version.
@inProceedings{Goldberg95,
  author    = {Goldberg, Jeffrey L.},
  title     = {CDM: an approach to learning in text categorization},
  booktitle = {Proceedings of ICTAI-95, 7th International Conference on Tools with Artificial Intelligence},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  editor    = {},
  address   = {Herndon, US},
  year      = {1995},
  pages     = {258--265},
  url       = {},
  note      = {An extended version appears as~\cite{Goldberg96}},
  abstract  = {The category discrimination method (CDM) is a new learning algorithm designed for text categorization. The motivation is that there are statistical problems associated with natural language text when it is applied as input to existing machine learning algorithms (too much noise, too many features, skewed distribution). The bases of the CDM are research results about the way that humans learn categories and concepts vis-a-vis contrasting concepts.
The essential formula is cue validity borrowed from cognitive psychology, and used to select from all possible single word-based features the `best' predictors of a given category. The hypothesis that CDM's performance exceeds two non-domain specific algorithms, Bayesian classification and decision tree learners, is empirically tested.}, }

% Goldberg96: journal article.
@article{Goldberg96,
  author    = {Goldberg, Jeffrey L.},
  title     = {CDM: an approach to learning in text categorization},
  journal   = {International Journal on Artificial Intelligence Tools},
  year      = {1996},
  number    = {1/2},
  volume    = {5},
  pages     = {229--253},
  url       = {},
  abstract  = {The Category Discrimination Method (CDM) is a new machine learning algorithm designed specifically for text categorization. The motivation is that there are statistical problems associated with natural language text when it is applied as input to existing machine learning algorithms (too much noise, too many features, skewed distribution). The bases of the CDM are research results about the way that humans learn categories and concepts vis-a-vis contrasting concepts. The essential formula is cue validity borrowed from cognitive psychology, and used to select from all possible single word based features, the best predictors of a given category. The hypothesis that CDM's performance will exceed two non domain specific algorithms, Bayesian classification and decision tree learners, is empirically tested.},
}

% Gomez02: JADT-02 conference paper. All accented characters in the author
% list are written as LaTeX escapes so the entry stays 7-bit clean for
% classic BibTeX.
@inProceedings{Gomez02,
  author    = {G{\'o}mez-Hidalgo, Jos{\'e} M. and De Buenaga Rodr{\'{\i}}guez, Jos{\'e} M. and Ure{\~n}a L{\'o}pez, Luis A. and Mart{\'{\i}}n Valdivia, Maria T.
and Garc{\'{\i}}a Vega, Manuel},
  title     = {Integrating Lexical Knowledge in Learning-Based Text Categorization},
  booktitle = {Proceedings of JADT-02, 6th International Conference on the Statistical Analysis of Textual Data},
  publisher = {},
  editor    = {},
  address   = {St-Malo, FR},
  pages     = {},
  year      = {2002},
  url       = {http://www.cavi.univ-paris3.fr/lexicometrica/jadt/jadt2002/PDF-2002/gomez_debuenaga_urena_martin_garcia.pdf},
  abstract  = {Automatic Text Categorization (ATC) is an important task in the field of Information Access. The prevailing approach to ATC is making use of a collection of prelabeled texts for the induction of a document classifier through learning methods. With the increasing availability of lexical resources in electronic form (including Lexical Databases (LDBs), Machine Readable Dictionaries, etc.), there is an interesting opportunity for the integration of them in learning-based ATC. In this paper, we present an approach to the integration of lexical knowledge extracted from the LDB WordNet in learning-based ATC, based on Stacked Generalization (SG). The method we suggest is based on combining the lexical knowledge extracted from the LDB interpreted as a classifier with a learning-based classifier, through SG. We have performed experiments which results show that the ideas we describe are promising and deserve further investigation.},
}

% Gomez02a: SAC-02 conference paper; editor is an empty placeholder.
@inProceedings{Gomez02a,
  author    = {G{\'o}mez-Hidalgo, Jos{\'e} M.},
  title     = {Evaluating Cost-Sensitive Unsolicited Bulk Email Categorization},
  booktitle = {Proceedings of SAC-02, 17th ACM Symposium on Applied Computing},
  editor    = {},
  address   = {Madrid, ES},
  pages     = {615--620},
  year      = {2002},
  url       = {http://doi.acm.org/10.1145/508791.508911},
  abstract  = {In the recent years, Unsolicited Bulk Email has became an increasingly important problem, with a big economic impact. In this paper, we discuss cost-sensitive Text Categorization methods for UBE filtering.
In concrete, we have evaluated a range of Machine Learning methods for the task (C4.5, Naive Bayes, PART, Support Vector Machines and Rocchio), made cost sensitive through several methods (Threshold Optimization, Instance Weighting, and Meta-Cost). We have used the Receiver Operating Characteristic Convex Hull method for the evaluation, that best suits classification problems in which target conditions are not known, as it is the case. Our results do not show a dominant algorithm nor method for making algorithms cost-sensitive, but are the best reported on the test collection used, and approach real-world hand-crafted classifiers accuracy.}, }

% Goodman90: IAAI-90 conference paper; address, url and abstract are empty placeholders.
@inProceedings{Goodman90,
  author    = {Marc Goodman},
  title     = {{\sc Prism}: a case-based telex classifier},
  booktitle = {Proceedings of IAAI-90, 2nd Conference on Innovative Applications of Artificial Intelligence},
  publisher = {AAAI Press, Menlo Park, US},
  editor    = {Alain Rappaport and Reid Smith},
  year      = {1990},
  address   = {},
  pages     = {25--37},
  url       = {},
  abstract  = {},
}

% Gray71: journal article; url and abstract are empty placeholders.
@article{Gray71,
  author    = {W. A. Gray and A. J. Harley},
  title     = {Computer-assisted indexing},
  journal   = {Information Storage and Retrieval},
  year      = {1971},
  volume    = {7},
  number    = {4},
  pages     = {167--174},
  url       = {},
  abstract  = {},
}

% Guo04: CICLING-04 conference paper (LNCS 2945); url and abstract are empty placeholders.
@inProceedings{Guo04,
  author    = {Gongde Guo and Hui Wang and David A. Bell and Yaxin Bi and Kieran Greer},
  title     = {An kNN Model-Based Approach and Its Application in Text Categorization},
  booktitle = {Proceedings of CICLING-04, 5th International Conference on Computational Linguistics and Intelligent Text Processing},
  year      = {2004},
  editor    = {Alexander F. Gelbukh},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Seoul, KO},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2945},
  pages     = {559--570},
  url       = {},
  abstract  = {},
}

% Guthrie94: COLING-94 conference paper.
@inProceedings{Guthrie94,
  author    = {Louise Guthrie and Elbert Walker and Joe A.
Guthrie},
  title     = {Document classification by machine: theory and practice},
  booktitle = {Proceedings of COLING-94, 15th International Conference on Computational Linguistics},
  publisher = {},
  editor    = {},
  address   = {Kyoto, JP},
  year      = {1994},
  pages     = {1059--1063},
  url       = {},
  abstract  = {},
}

% Guthrie99: chapter in an edited collection; url and abstract are empty placeholders.
@inCollection{Guthrie99,
  author    = {Louise Guthrie and Joe A. Guthrie and James Leistensnider},
  title     = {Document classification and routing},
  booktitle = {Natural language information retrieval},
  editor    = {Tomek Strzalkowski},
  year      = {1999},
  pages     = {289--310},
  publisher = {Kluwer Academic Publishers},
  address   = {Dordrecht, NL},
  url       = {},
  abstract  = {},
}

% Hadjarian01: CICLING-01 conference paper (LNCS 2004). The series name in
% the note is spelled the same way as in the other LNCS entries of this file.
@inProceedings{Hadjarian01,
  author    = {Ali Hadjarian and Jerzy Bala and Peter Pachowicz},
  title     = {Text Categorization through Multistrategy Learning and Visualization},
  booktitle = {Proceedings of CICLING-01, 2nd International Conference on Computational Linguistics and Intelligent Text Processing},
  year      = {2001},
  editor    = {Alexander Gelbukh},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Mexico City, ME},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2004},
  pages     = {423--436},
  url       = {http://link.springer.de/link/service/series/0558/papers/2004/20040437.pdf},
  abstract  = {This paper introduces a multistrategy learning approach to the categorization of text documents. The approach benefits from two existing, and in our view complimentary, sets of categorization techniques: those based on Rocchio's algorithm and those belonging to the rule learning class of machine learning algorithms. Visualization is used for the presentation of the output of learning},
}

% Hamill78: ASIS-78 conference paper.
@inProceedings{Hamill78,
  author    = {Hamill, Karen A.
and Zamora, Antonio},
  title     = {An automatic document classification system using pattern recognition techniques},
  booktitle = {Proceedings of ASIS-78, 41st Annual Meeting of the American Society for Information Science},
  publisher = {American Society for Information Science, Washington, US},
  editor    = {Everett H. Brenner},
  year      = {1978},
  address   = {New York, US},
  pages     = {152--155},
  url       = {},
  abstract  = {},
}

% Hamill80: journal article; url and abstract are empty placeholders.
@article{Hamill80,
  author    = {Hamill, Karen A. and Zamora, Antonio},
  title     = {The Use of titles for Automatic Document Classification},
  journal   = {Journal of the American Society for Information Science},
  year      = {1980},
  number    = {6},
  pages     = {396--402},
  volume    = {33},
  url       = {},
  abstract  = {},
}

% Han01: PAKDD-01 conference paper (LNCS 2035).
@inProceedings{Han01,
  author    = {Eui-Hong Han and George Karypis and Vipin Kumar},
  title     = {Text Categorization Using Weight-Adjusted $k$-Nearest Neighbor Classification},
  booktitle = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on Knowledge Discovery and Data Mining},
  editor    = {David Cheung and Qing Li and Graham Williams},
  year      = {2001},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Hong Kong, CN},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2035},
  pages     = {53--65},
  url       = {http://link.springer.de/link/service/series/0558/papers/2035/20350053.pdf},
  abstract  = {Text categorization presents unique challenges due to the large number of attributes present in the data set, large number of training samples, attribute dependency, and multi-modality of categories. Existing classification techniques have limited applicability in the data sets of these natures. In this paper, we present a Weight Adjusted k-Nearest Neighbor (WAKNN) classification that learns feature weights based on a greedy hill climbing technique. We also present two performance optimizations of WAKNN that improve the computational performance by a few orders of magnitude, but do not compromise on the classification quality.
We experimentally evaluated WAKNN on 52 document data sets from a variety of domains and compared its performance against several classification algorithms, such as C4.5, RIPPER, Naive-Bayesian, PEBLS and VSM. Experimental results on these data sets confirm that WAKNN consistently outperforms other existing classification algorithms.}, }

% Hanauer96: journal article; url is an empty placeholder.
@article{Hanauer96,
  author    = {David Hanauer},
  title     = {Integration of phonetic and graphic features in poetic text categorization judgements},
  journal   = {Poetics},
  year      = {1996},
  volume    = {23},
  number    = {5},
  pages     = {363--380},
  url       = {},
  abstract  = {The experiments reported in this paper deal with the relationship between specific formal textual features, i.e. graphic and phonetic information, and the reader's literary educational background in the categorization of poetic texts. In two experiments, the research method of Information Integration Theory was employed in order to test two hypotheses relating to the radical conventionalist and traditional positions on the role of specific formal textual features in the categorization of poetic texts. Twenty subjects from expert or novice literary reading experience backgrounds were, in two experiments, required to rate two parallel sets of graphically and phonetically manipulated poems. The results reveal that subjects are sensitive to the manipulations of graphic and phonetic information and use the same additive information integration rule in making poetic text categorization judgements. The expert literary readers were found to assign significantly higher ratings to all versions of the manipulated poems than the novice readers.},
}

% Hayes88: ANLP-88 conference paper.
@inProceedings{Hayes88,
  author    = {Philip J. Hayes and Laura E. Knecht and Monica J.
Cellio},
  title     = {A news story categorization system},
  booktitle = {Proceedings of ANLP-88, 2nd Conference on Applied Natural Language Processing},
  publisher = {Association for Computational Linguistics, Morristown, US},
  address   = {Austin, US},
  editor    = {},
  year      = {1988},
  pages     = {9--17},
  url       = {},
  note      = {Reprinted in Karen Sparck Jones and Peter Willett (eds.), ``Readings in Information Retrieval'', Morgan Kaufmann, San Francisco, US, 1997, pp.\ 518--526.},
  abstract  = {The article describes a pilot version of a commercial application of natural language processing techniques to the problem of categorizing news stories into broad topic categories. The system does not perform a complete semantic or syntactic analysis of the input stories. Its categorizations are dependent on fragmentary recognition using pattern-matching techniques. The fragments it looks for are determined by a set of knowledge-based rules. The accuracy of the system is only slightly lower than that of human categorizers.},
}

@inProceedings{Hayes90,
  author    = {Philip J. Hayes and Steven P. Weinstein},
  title     = {{\sc Construe/Tis}: a system for content-based indexing of a database of news stories},
  booktitle = {Proceedings of IAAI-90, 2nd Conference on Innovative Applications of Artificial Intelligence},
  publisher = {AAAI Press, Menlo Park, US},
  editor    = {Alain Rappaport and Reid Smith},
  year      = {1990},
  pages     = {49--66},
  url       = {},
  abstract  = {},
  address   = {Boston, US},
}

@inProceedings{Hayes90a,
  author    = {Philip J. Hayes and Peggy M. Andersen and Irene B. Nirenburg and Linda M.
Schmandt},
  title     = {{\sc Tcs}: a shell for content-based text categorization},
  booktitle = {Proceedings of CAIA-90, 6th IEEE Conference on Artificial Intelligence Applications},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  editor    = {},
  year      = {1990},
  address   = {Santa Barbara, US},
  pages     = {320--326},
  url       = {},
  abstract  = {The kind of application that the text categorization shell, TCS, can produce is characterized. Many of its applications have great commercial value. The design goals for TCS are discussed, and other approaches to text categorization in the light of these goals are examined. The TCS and how it meets its design goals are described, and examples of applications built with TCS are given. A text-categorization application developed with TCS consists of the TCS run-time system and a rule base. The rule base defines what categories the application can assign to texts and contains rules that make the categorization decisions for particular texts. The data-driven nature of TCS allows it to satisfy fully the requirements of ease of application development, portability to other applications and maintainability.},
}

@article{He03,
  author    = {Ji He and Ah-Hwee Tan and Chew-Lim Tan},
  title     = {On Machine Learning Methods for {Chinese} Document Categorization},
  journal   = {Applied Intelligence},
  year      = {2003},
  volume    = {18},
  number    = {3},
  pages     = {311--322},
  url       = {http://www.kluweronline.com/issn/0924-669X},
  abstract  = {This paper reports our comparative evaluation of three machine learning methods, namely k Nearest Neighbor (kNN), Support Vector Machines (SVM), and Adaptive Resonance Associative Map (ARAM) for Chinese document categorization. Based on two Chinese corpora, a series of controlled experiments evaluated their learning capabilities and efficiency in mining text classification knowledge. Benchmark experiments showed that their predictive performance were roughly comparable, especially on clean and well organized data sets.
While kNN and ARAM yield better performances than SVM on small and clean data sets, SVM and ARAM significantly outperformed kNN on noisy data. Comparing efficiency, kNN was notably more costly in terms of time and memory than the other two methods. SVM is highly efficient in learning from well organized samples of moderate size, although on relatively large and noisy data the efficiency of SVM and ARAM are comparable.},
}

@article{Heaps73,
  author    = {H. S. Heaps},
  title     = {A theory of relevance for automatic document classification},
  year      = {1973},
  journal   = {Information and Control},
  volume    = {22},
  number    = {3},
  pages     = {268--278},
  url       = {},
  abstract  = {},
}

@inProceedings{Hearst91,
  author    = {Marti A. Hearst},
  title     = {Noun homograph disambiguation using local context in large corpora},
  booktitle = {Proceedings of the 7th Annual Conference of the University of Waterloo Centre for the New Oxford English Dictionary},
  publisher = {},
  editor    = {},
  year      = {1991},
  pages     = {1--22},
  address   = {Oxford, UK},
  url       = {ftp://parcftp.xerox.com/pub/hearst/oed91.ps.gz},
  abstract  = {This paper describes an accurate, relatively inexpensive method for the disambiguation of noun homographs using large text corpora. The algorithm checks the context surrounding the target noun against that of previously observed instances and chooses the sense for which the most evidence is found, where evidence consists of a set of orthographic, syntactic, and lexical features. Because the sense distinctions made are coarse, the disambiguation can be accomplished without the expense of knowledge bases or inference mechanisms. An implementation of the algorithm is described which, starting with a small set of hand-labeled instances, improves its results automatically via unsupervised training.
The approach is compared to other attempts at homograph disambiguation using both machine readable dictionaries and unrestricted text and the use of training instances is determined to be a crucial difference.},
}

@proceedings{Hearst96a,
  editor    = {Marti A. Hearst and Haym Hirsh},
  title     = {Machine Learning in Information Access. Papers from the 1996 AAAI Spring Symposium},
  organization = {American Association for Artificial Intelligence},
  address   = {Stanford, US},
  year      = {1996},
  note      = {Available as Technical Report SS-96-05},
  url       = {},
  abstract  = {},
}

@inProceedings{Hersh94,
  author    = {William Hersh and Christopher Buckley and T. J. Leone and David Hickman},
  title     = {{{\sc Ohsumed}}: an interactive retrieval evaluation and new large text collection for research},
  booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and Van Rijsbergen, Cornelis J.},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Dublin, IE},
  pages     = {192--201},
  year      = {1994},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p192-hersh/p192-hersh.pdf},
  abstract  = {A series of information retrieval experiments was carried out with a computer installed in a medical practice setting for relatively inexperienced physician end-users. Using a commercial MEDLINE product based on the vector space model, these physicians searched just as effectively as more experienced searchers using Boolean searching.
The results of this experiment were subsequently used to create a new large medical test collection, which was used in experiments with the SMART retrieval system to obtain baseline performance data as well as compare SMART with the other searchers.},
}

@inProceedings{Hoashi00,
  author    = {Keiichiro Hoashi and Kazunori Matsumoto and Naomi Inoue and Kazuo Hashimoto},
  title     = {Document filtering methods using non-relevant information profile},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {ACM Press, New York, US},
  address   = {Athens, GR},
  year      = {2000},
  pages     = {176--183},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p176-hoashi/p176-hoashi.pdf},
  abstract  = {Document filtering is a task to retrieve documents relevant to a user's profile from a flow of documents. Generally, filtering systems calculate the similarity between the profile and each incoming document, and retrieve documents with similarity higher than a threshold. However, many systems set a relatively high threshold to reduce retrieval of non-relevant documents, which results in the ignorance of many relevant documents. In this paper, we propose the use of a non-relevant information profile to reduce the mistaken retrieval of non-relevant documents. Results from experiments show that this filter has successfully rejected a sufficient number of non-relevant documents, resulting in an improvement of filtering performance.},
}

@inProceedings{Hoch94,
  author    = {Rainer Hoch},
  title     = {Using {IR} techniques for text classification in document analysis},
  booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {W.
Bruce Croft and Van Rijsbergen, Cornelis J.},
  publisher = {Springer Verlag, Heidelberg, DE},
  year      = {1994},
  address   = {Dublin, IE},
  pages     = {31--40},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p31-hoch/p31-hoch.pdf},
  abstract  = {This paper presents the INFOCLAS system applying statistical methods of information retrieval for the classification of German business letters into corresponding message types such as order, offer, enclosure, etc. INFOCLAS is a first step towards the understanding of documents proceeding to a classification-driven extraction of information. The system is composed of two main modules: the central indexer (extraction and weighting of indexing terms) and the classifier (classification of business letters into given types). The system employs several knowledge sources including a letter database, word frequency statistics for German, lists of message type specific words, morphological knowledge as well as the underlying document structure. As output, the system evaluates a set of weighted hypotheses about the type of the actual letter. Classification of documents allow the automatic distribution or archiving of letters and is also an excellent starting point for higher-level document analysis.},
}

@article{Hoyle73,
  author    = {W. G. Hoyle},
  title     = {Automatic indexing and generation of classification by algorithm},
  journal   = {Information Storage and Retrieval},
  year      = {1973},
  volume    = {9},
  number    = {4},
  pages     = {233--242},
  url       = {},
  abstract  = {A system of automatic indexing based on Bayes' theorem is described briefly. In assigning 124 documents to 9 categories, there were 97 cases of agreement with professional indexers. Using a collection factor, based on 87 per cent human consistency from other courses, the computer appears then to index with 90 per cent accuracy in this case. The technique is then used with two randomized sample document groups drawn from nine categories.
Each group in turn is used as the basis for indexing the other. The computer knows only the number of categories. After 8 cycles the computer is found to have formed 9 groups consisting of about 50 per cent of documents that were also lumped together by professional indexers on the basis of subject content. A new measure of performance is proposed and some other applications of the technique indicated.},
}

@inProceedings{Hsu99,
  author    = {Wen-Lin Hsu and Sheau-Dong Lang},
  title     = {Classification algorithms for {NETNEWS} articles},
  booktitle = {Proceedings of CIKM-99, 8th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor    = {},
  year      = {1999},
  address   = {Kansas City, US},
  pages     = {114--121},
  url       = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p114-hsu/p114-hsu.pdf},
  abstract  = {We propose several algorithms using the vector space model to classify the news articles posted on the NETNEWS according to the newsgroup categories. The baseline method combines the terms of all the articles of each newsgroup in the training set to represent the newsgroups as single vectors. After training, the incoming news articles are classified based on their similarity to the existing newsgroup categories. We propose to use the following techniques to improve the classification performance of the baseline method: (1) use routing (classification) accuracy and the similarity values to refine the training set; (2) update the underlying term structures periodically during testing; and (3) apply k-means clustering to partition the newsgroup articles and represent each newsgroup by k vectors. Our test collection consists of the real news articles and the 519 subnewsgroups under the REC newsgroup of NETNEWS in a period of 3 months. Our experimental results demonstrate that the technique of refining the training set reduces from one-third to two-thirds of the storage.
The technique of periodical updates improves the routing accuracy ranging from 20\% to 100\% but incurs runtime overhead. Finally, representing each newsgroup by k vectors (with k = 2 or 3) using clustering yields the most significant improvement in routing accuracy, ranging from 60\% to 100\%, while causing only slightly higher storage requirements.},
}

@inProceedings{Hsu99a,
  author    = {Wen-Lin Hsu and Sheau-Dong Lang},
  title     = {Feature Reduction and Database Maintenance in {NETNEWS} Classification},
  booktitle = {Proceedings of IDEAS-99, 1999 International Database Engineering and Applications Symposium},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  editor    = {},
  year      = {1999},
  address   = {Montreal, CA},
  pages     = {137--144},
  url       = {http://dlib.computer.org/conferen/ideas/0265/pdf/02650137.pdf},
  abstract  = {We propose a statistical feature-reduction technique to filter out the most ambiguous articles in the training data for categorizing the NETNEWS articles. We also incorporate a batch updating scheme to periodically do maintenance on the term structures of the news database after training. The baseline method combines the terms of all the articles of each newsgroup in the training set to represent the newsgroups as single vectors. After training, the incoming news articles are classified based on their similarity to the existing newsgroup categories. Our implementation uses an inverted file to store the trained term structures of each newsgroup, and uses a list similar to the inverted file to buffer the newly arrival articles, for efficient routing and updating purposes.
Our experimental results using real NETNEWS articles and newsgroups demonstrate (1) applying feature reduction to the training set improves the routing accuracy, efficiency, and database storage; (2) updating improves the routing accuracy; and (3) the batch technique improves the efficiency of the updating operation.},
}

@inProceedings{Huffman94,
  author    = {Stephen Huffman and Marc Damashek},
  title     = {Acquaintance: A Novel Vector-Space N-Gram Technique for Document Categorization},
  booktitle = {Proceedings of TREC-3, 3rd Text Retrieval Conference},
  publisher = {National Institute of Standards and Technology, Gaithersburg, US},
  editor    = {Donna K. Harman},
  year      = {1994},
  address   = {Gaithersburg, US},
  pages     = {305--310},
  url       = {},
  abstract  = {Acquaintance is the name of a novel vector-space n-gram technique for categorizing documents. The technique is completely language-independent, highly garble-resistant, and computationally simple. An unoptimized version of the algorithm was used to process the TREC database in a very short time.},
}

@inProceedings{Huffman95,
  author    = {Stephen Huffman},
  title     = {Acquaintance: Language-Independent Document Categorization by N-Grams},
  booktitle = {Proceedings of TREC-4, 4th Text Retrieval Conference},
  publisher = {National Institute of Standards and Technology, Gaithersburg, US},
  editor    = {Donna K. Harman and Ellen M. Voorhees},
  year      = {1995},
  address   = {Gaithersburg, US},
  pages     = {359--371},
  url       = {http://trec.nist.gov/pubs/trec4/papers/nsa.ps.gz},
  abstract  = {Acquaintance is the name of a novel vector-space n-gram technique for categorizing documents. The technique is completely language-independent, highly garble-resistant, and computationally simple. An unoptimized version of the algorithm was used to process the TREC database in a very short time.
The TREC-3 conference provided the first public demonstration and evaluation of this new technique, and TREC-4 provided an opportunity to test its usefulness on several types of text retrieval tasks.},
}

@inProceedings{Hull94,
  author    = {Hull, David A.},
  title     = {Improving text retrieval for the routing problem using latent semantic indexing},
  booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and Van Rijsbergen, Cornelis J.},
  publisher = {Springer Verlag, Heidelberg, DE},
  year      = {1994},
  address   = {Dublin, IE},
  pages     = {282--289},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p282-hull/p282-hull.pdf},
  abstract  = {Latent Semantic Indexing (LSI) is a novel approach to information retrieval that attempts to model the underlying structure of term associations by transforming the traditional representation of documents as vectors of weighted term frequencies to a new coordinate space where both documents and terms are represented as linear combinations of underlying semantic factors. In previous research, LSI has produced a small improvement in retrieval performance. In this paper, we apply LSI to the routing task, which operates under the assumption that a sample of relevant and non-relevant documents is available to use in constructing the query. Once again, LSI slightly improves performance. However, when LSI is used in conjunction with statistical classification, there is a dramatic improvement in performance.},
}

@inProceedings{Hull96,
  author    = {David A. Hull and Jan O.
Pedersen and Hinrich Sch{\"u}tze},
  title     = {Method combination for document filtering},
  booktitle = {Proceedings of SIGIR-96, 19th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and Ross Wilkinson},
  publisher = {ACM Press, New York, US},
  year      = {1996},
  address   = {Z{\"{u}}rich, CH},
  pages     = {279--288},
  url       = {ftp://parcftp.xerox.com/pub/qca/papers/sigirfiltering96.ps},
  abstract  = {There is strong empirical and theoretic evidence that combination of retrieval methods can improve performance. In this paper, we systematically compare combination strategies in the context of document filtering, using queries from the Tipster reference corpus. We find that simple averaging strategies do indeed improve performance, but that direct averaging of probability estimates is not the correct approach. Instead, the probability estimates must be renormalized using logistic regression on the known relevance judgements. We examine more complex combination strategies but find them less successful due to the high correlations among our filtering methods which are optimized over the same training data and employ similar document representations.},
}

@inProceedings{Hull98,
  author    = {David A. Hull},
  title     = {The {TREC-7} filtering track: description and analysis},
  booktitle = {Proceedings of TREC-7, 7th Text Retrieval Conference},
  publisher = {National Institute of Standards and Technology, Gaithersburg, US},
  editor    = {Ellen M. Voorhees and Donna K. Harman},
  year      = {1998},
  address   = {Gaithersburg, US},
  pages     = {33--56},
  url       = {http://trec.nist.gov/pubs/trec7/papers/tr7filter/paper.ps},
  abstract  = {This article describes the experiments conducted in the TREC-7 filtering track, which consisted of three subtasks: adaptive filtering, batch filtering, and routing.
The focus this year is on adaptive filtering, where the system begins with only the topic statement and must interactively adjust a filtering profile constructed from that topic in response to on-line feedback. In addition to motivating the task and describing the practical details of participating in the track, this document includes a detailed graphical presentation of the experimental results and provides a brief overall analysis of the performance data.},
}

@inProceedings{Ipeirotis01,
  author    = {Panagiotis G. Ipeirotis and Luis Gravano and Mehran Sahami},
  title     = {Probe, count, and classify: categorizing hidden {Web} databases},
  booktitle = {Proceedings of SIGMOD-01, ACM International Conference on Management of Data},
  editor    = {Walid G. Aref},
  publisher = {ACM Press, New York, US},
  year      = {2001},
  address   = {Santa Barbara, US},
  pages     = {67--78},
  url       = {http://doi.acm.org/10.1145/375663.375671},
  abstract  = {The contents of many valuable web-accessible databases are only accessible through search interfaces and are hence invisible to traditional web ``crawlers''. Recent studies have estimated the size of this ``hidden web'' to be 500 billion pages, while the size of the ``crawlable'' web is only an estimated two billion pages. Recently, commercial web sites have started to manually organize web-accessible databases into Yahoo!-like hierarchical classification schemes. In this paper, we introduce a method for automating this classification process by using a small number of query probes. To classify a database, our algorithm does not retrieve or inspect any documents or pages from the database, but rather just exploits the number of matches that each query probe generates at the database in question. We have conducted an extensive experimental evaluation of our technique over collections of real documents, including over one hundred web-accessible databases.
Our experiments show that our system has low overhead and achieves high classification accuracy across a variety of databases.},
}

@inProceedings{Ittner95,
  author    = {David J. Ittner and Lewis, David D. and David D. Ahn},
  title     = {Text categorization of low quality images},
  booktitle = {Proceedings of SDAIR-95, 4th Annual Symposium on Document Analysis and Information Retrieval},
  publisher = {},
  editor    = {},
  year      = {1995},
  address   = {Las Vegas, US},
  pages     = {301--315},
  url       = {http://www.research.att.com/~lewis/papers/ittner95.ps},
  abstract  = {Categorization of text images into content-oriented classes would be a useful capability in a variety of document handling systems. Many methods can be used to categorize texts once their words are known, but OCR can garble a large proportion of words, particularly when low quality images are used. Despite this, we show for one data set that fax quality images can be categorized with nearly the same accuracy as the original text. Further, the categorization system can be trained on noisy OCR output, without need for the true text of any image, or for editing of OCR output.
The use of a vector space classifier and training method robust to large feature sets, combined with discarding of low frequency OCR output strings are the key to our approach.},
}

@inProceedings{Iwayama94,
  author    = {Makoto Iwayama and Takenobu Tokunaga},
  title     = {A Probabilistic Model for Text Categorization: Based on a Single Random Variable with Multiple Values},
  booktitle = {Proceedings of ANLP-94, 4th Conference on Applied Natural Language Processing},
  publisher = {Association for Computational Linguistics, Morristown, US},
  editor    = {},
  year      = {1994},
  address   = {Stuttgart, DE},
  pages     = {162--167},
  url       = {},
  abstract  = {},
}

@inProceedings{Iwayama95,
  author    = {Makoto Iwayama and Takenobu Tokunaga},
  title     = {Cluster-based text categorization: a comparison of category search strategies},
  booktitle = {Proceedings of SIGIR-95, 18th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Edward A. Fox and Peter Ingwersen and Raya Fidel},
  publisher = {ACM Press, New York, US},
  year      = {1995},
  address   = {Seattle, US},
  pages     = {273--281},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/215206/p273-iwayama/p273-iwayama.pdf},
  abstract  = {Text categorization can be viewed as a process of category search, in which one or more categories for a test document are searched for by using given training documents with known categories. A cluster based search with a probabilistic clustering algorithm is proposed and evaluated on two data sets. The efficiency, effectiveness, and noise tolerance of this search strategy were confirmed to be better than those of a full search, a category based search, and a cluster based search with nonprobabilistic clustering.},
}

@inProceedings{Iwayama95a,
  author    = {Makoto Iwayama and Takenobu Tokunaga},
  title     = {Hierarchical {Bayesian} clustering for automatic text classification},
  booktitle = {Proceedings of IJCAI-95, 14th International Joint Conference on Artificial Intelligence},
  editor    = {Chris E.
Mellish},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  year      = {1995},
  address   = {Montreal, CA},
  pages     = {1322--1327},
  url       = {},
  abstract  = {Text classification, the grouping of texts into several clusters, has been used as a means of improving both the efficiency and the effectiveness of text retrieval/categorization. In this paper we propose a hierarchical clustering algorithm that constructs a set of clusters having the maximum Bayesian posterior probability, the probability that the given texts are classified into clusters. We call the algorithm Hierarchical Bayesian Clustering (HBC). The advantages of HBC are experimentally verified from several viewpoints. HBC can reconstruct the original clusters more accurately than other non-probabilistic algorithms. When a probabilistic text categorization is extended to a cluster-based one, the use of HBC offers better performance than the use of non-probabilistic algorithms.},
}

@inProceedings{Iwazume96,
  author    = {Michiaki Iwazume and Hideaki Takeda and Toyoaki Nishida},
  title     = {Ontology-Based Information Gathering and Text Categorization from the {Internet}},
  booktitle = {Proceedings of IEA/AIE-96, 9th International Conference in Industrial and Engineering Applications of Artificial Intelligence and Expert Systems},
  editor    = {},
  publisher = {},
  year      = {1996},
  address   = {Fukuoka, JP},
  pages     = {305--314},
  url       = {},
  abstract  = {},
}

@inProceedings{Iyer00,
  author    = {Raj D. Iyer and David D. Lewis and Robert E. Schapire and Yoram Singer and Amit Singhal},
  title     = {Boosting for Document Routing},
  booktitle = {Proceedings of CIKM-00, 9th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  address   = {McLean, US},
  editor    = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
  year      = {2000},
  pages     = {70--77},
  url       = {http://www.cs.huji.ac.il/~singer/papers/rankboost.ps.gz},
  abstract  = {RankBoost is a recently proposed algorithm for learning ranking functions.
It is simple to implement and has strong justifications from computational learning theory. We describe the algorithm and present experimental results on applying it to the document routing problem. The first set of results applies RankBoost to a text representation produced using modern term weighting methods. Performance of RankBoost is somewhat inferior to that of a state-of-the-art routing algorithm which is, however, more complex and less theoretically justified than RankBoost. RankBoost achieves comparable performance to the state-of-the-art algorithm when combined with feature or example selection heuristics. Our second set of results examines the behavior of RankBoost when it has to learn not only a ranking function but also all aspects of term weighting from raw data. Performance is usually, though not always, less good here, but the term weighting functions implicit in the resulting ranking functions are intriguing, and the approach could easily be adapted to mixtures of textual and nontextual data.},
}

@inProceedings{Jacobs92,
  author    = {Paul S. Jacobs},
  title     = {Joining statistics with {NLP} for text categorization},
  booktitle = {Proceedings of ANLP-92, 3rd Conference on Applied Natural Language Processing},
  publisher = {Association for Computational Linguistics, Morristown, US},
  editor    = {Marcia Bates and Oliviero Stock},
  year      = {1992},
  address   = {Trento, IT},
  pages     = {178--185},
  url       = {},
  abstract  = {Automatic news categorization systems have produced high accuracy, consistency, and flexibility using some natural language processing techniques. These knowledge-based categorization methods are more powerful and accurate than statistical techniques. However, the phrasal pre-processing and pattern matching methods that seem to work for categorization have the disadvantage of requiring a fair amount of knowledge-encoding by human beings.
In addition, they work much better at certain tasks, such as identifying major events in texts, than at others, such as determining what sort of business or product is involved in a news event. Statistical methods for categorization, on the other hand, are easy to implement and require little or no human customization. But they don't offer any of the benefits of natural language processing, such as the ability to identify relationships and enforce linguistic constraints. The authors' approach has been to use statistics in the knowledge acquisition component of a linguistic pattern-based categorization system, using statistical methods, for example, to associate words with industries and identify phrases that information about businesses or products. Instead of replacing knowledge-based methods with statistics, statistical training replaces knowledge engineering. This has resulted in high accuracy, shorter customization time, and good prospects for the application of the statistical methods to problems in lexical acquisition.},
}

@article{Jacobs93,
  author    = {Paul S. Jacobs},
  title     = {Using Statistical Methods to Improve Knowledge-Based News Categorization},
  journal   = {IEEE Expert},
  year      = {1993},
  number    = {2},
  volume    = {8},
  pages     = {13--23},
  url       = {},
  abstract  = {},
}

@inProceedings{Jo99,
  author    = {Taeho C. Jo},
  title     = {Text categorization with the concept of fuzzy set of informative keywords},
  booktitle = {Proceedings of FUZZ-IEEE'99, IEEE International Conference on Fuzzy Systems},
  editor    = {},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  address   = {Seoul, KR},
  pages     = {609--614},
  year      = {1999},
  url       = {},
  abstract  = {Text categorization is the procedure of assigning a category to a particular document among predefined categories. Informative keywords are the ones which reflect the contents of a document. A document includes informative keywords and non-informative keywords.
Mainly non-informative keywords play the roles of grammatical functions in sentences; such keywords, what are called functional keywords, reflect its contents very little, so they should be removed in the process of document indexing. The discrimination between informative keywords and functional keywords is not crisp. In the process of document indexing, a document is represented as a set of informative keywords. In this paper, it is proposed that a document be represented into a fuzzy set of informative keywords, instead of a crisp set of informative keywords. The experiments of the categorization of news articles show that the proposed schemes of text categorization outperform the schemes with crisp sets.},
}

@inCollection{Jo99a,
  author    = {Taeho C. Jo},
  title     = {News article classification based on categorical points from keywords in backdata},
  booktitle = {Computational Intelligence for Modelling, Control and Automation},
  editor    = {Masoud Mohammadian},
  publisher = {IOS Press},
  address   = {Amsterdam, NL},
  pages     = {211--214},
  year      = {1999},
  url       = {},
  abstract  = {A scheme of automatic document classification is presented. Previously, documents have been classified according to their contents manually. Therefore, it is very costly to assign a category to them because a human investigates their contents. As the amount of data stored in storage media is increased exponentially, it becomes necessary to store documents according to their category, to access them easily. Automatic text classification is needed to store documents like that. Before performing text classification, back data should be constructed. The back data stores the information about keywords: the frequency for each category, the number of documents for each category. A document is represented with a list of keywords. Categorical points to each category are computed by summing the frequency of each keyword from back data, or the number of documents from it.
The category that contains the largest categorical points is selected as the category of a document. In the results of an experiment with news article classification, precision is about 98\%.}, } @inCollection{Jo99b, author = {Taeho C. Jo}, title = {News articles classification based on representative keywords of categories}, booktitle = {Computational Intelligence for Modelling, Control and Automation}, editor = {Masoud Mohammadian}, publisher = {IOS Press}, address = {Amsterdam, NL}, pages = {194--198}, year = {1999}, url = {}, abstract = {A scheme of automatic document classification is presented. So far, documents have been classified according to their contents manually. Therefore, it is very costly to assign a category for them because humans investigate their contents. As the amount of data stored in storage media is increased exponentially, it becomes necessary to store documents according to their category, to access them easily. Automatic text classification is necessary to store documents like that. The scheme for automatic text classification proposed in the paper, is based on document indexing, where a document is represented as a list of keywords. The number of common keywords between keywords from the document itself and representative keywords from back data classifies documents. As an example, the proposed scheme is applied to the classification of news articles into 3 categories: politics, sports, and business. 
The measurements of performance evaluation are: classification rate, correctness rate, and classified correctness rate.}, } @inProceedings{Joachims00, author = {Thorsten Joachims}, title = {Estimating the Generalization Performance of a {SVM} Efficiently}, booktitle = {Proceedings of ICML-00, 17th International Conference on Machine Learning}, editor = {Pat Langley}, year = {2000}, address = {Stanford, US}, pages = {431--438}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_00a.pdf}, abstract = {This paper proposes and analyzes an efficient and effective approach for estimating the generalization performance of a support vector machine (SVM) for text classification. Without any computation-intensive resampling, the new estimators are computationally much more efficient than cross-validation or bootstrapping. They can be computed at essentially no extra cost immediately after training a single SVM. Moreover, the estimators developed here address the special performance measures needed for evaluating text classifiers. They can be used not only to estimate the error rate, but also to estimate recall, precision, and F1. A theoretical analysis and experiments show that the new method can effectively estimate the performance of SVM text classifiers in an efficient way.}, } @inProceedings{Joachims01b, author = {Thorsten Joachims and Nello Cristianini and John Shawe-Taylor}, title = {Composite Kernels for Hypertext Categorisation}, booktitle = {Proceedings of ICML-01, 18th International Conference on Machine Learning}, editor = {Carla Brodley and Andrea Danyluk}, address = {Williams College, US}, year = {2001}, pages = {250--257}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www.cs.cornell.edu/People/tj/publications/joachims_etal_01a.pdf}, abstract = {Kernels are problem-specific functions that act as an interface between the learning system and the data.
While it is well-known when the combination of two kernels is again a valid kernel, it is an open question if the resulting kernel will perform well. In particular, in which situations can a combination of kernel be expected to perform better than its components considered separately? Intuitively, one would like each of the two kernels to contribute information that is not available to the other. This characterization hence must consider the data at hand, both the kernels and also the task, that is the information given by the labels. We investigate this problem by looking at the task of designing kernels for hypertext classification, where both words and links information can be exploited. Firstly we introduce a novel kernel, whose Gram matrix is the well known co-citation matrix from bibliometrics, and demonstrate on real data that it has a good performance. Then we study the problem of combining it with a standard bag of words kernel. We provide sufficient conditions that indicate when an improvement can be expected, highlighting and formalising the notion of ``independent kernels''. Experimental results confirm the predictions of the theory in the hypertext domain.}, } @inProceedings{Joachims01c, author = {Thorsten Joachims}, title = {A Statistical Learning Model of Text Classification with Support Vector Machines}, booktitle = {Proceedings of SIGIR-01, 24th ACM International Conference on Research and Development in Information Retrieval}, editor = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin Zobel}, publisher = {ACM Press, New York, US}, address = {New Orleans, US}, year = {2001}, pages = {128--136}, url = {http://www.cs.cornell.edu/People/tj/publications/joachims_01a.pdf}, abstract = {This paper develops a theoretical learning model of text classification for Support Vector Machines (SVMs). It connects the statistical properties of text-classification tasks with the generalization performance of a SVM in a quantitative way. 
Unlike conventional approaches to learning text classifiers, which rely primarily on empirical evidence, this model explains why and when SVMs perform well for text classification. In particular, it addresses the following questions: Why can support vector machines handle the large feature spaces in text classification effectively? How is this related to the statistical properties of text? What are sufficient conditions for applying SVMs to text-classification problems successfully?}, } @article{Joachims02, author = {Thorsten Joachims and Fabrizio Sebastiani}, title = {Guest editors' introduction to the special issue on automated text categorization}, journal = {Journal of Intelligent Information Systems}, year = {2002}, note = {Special Issue on Automated Text Categorization}, volume = {18}, number = {2/3}, pages = {103--105}, url = {http://www.wkap.nl/article.pdf?391241}, } @book{Joachims02a, author = {Thorsten Joachims}, title = {Learning to Classify Text using Support Vector Machines}, publisher = {Kluwer Academic Publishers}, address = {Dordrecht, NL}, year = {2002}, } @inProceedings{Joachims97, author = {Thorsten Joachims}, title = {A probabilistic analysis of the {Rocchio} algorithm with {TFIDF} for text categorization}, booktitle = {Proceedings of ICML-97, 14th International Conference on Machine Learning}, editor = {Douglas H. Fisher}, year = {1997}, address = {Nashville, US}, pages = {143--151}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_97a.ps.gz}, abstract = {The Rocchio relevance feedback algorithm is one of the most popular and widely applied learning methods from information retrieval. Here, a probabilistic analysis of this algorithm is presented in a text categorization framework. The analysis gives theoretical insight into the heuristics used in the Rocchio algorithm, particularly the word weighting scheme and the similarity metric. 
It also suggests improvements which lead to a probabilistic variant of the Rocchio classifier. The Rocchio classifier, its probabilistic variant, and a naive Bayes classifier are compared on six text categorization tasks. The results show that the probabilistic algorithms are preferable to the heuristic Rocchio classifier not only because they are more well-founded, but also because they achieve better performance.}, } @inProceedings{Joachims97b, author = {Thorsten Joachims and Dayne Freitag and Tom M. Mitchell}, title = {{\sc WebWatcher}: a tour guide for the {World Wide Web}}, booktitle = {Proceedings of IJCAI-97, 15th International Joint Conference on Artificial Intelligence}, editor = {Martha E. Pollack}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, year = {1997}, address = {Nagoya, JP}, pages = {770--775}, url = {http://www.cs.cmu.edu/afs/cs/user/dayne/www/ps/ijcai97.ps.Z}, abstract = {We describe WebWatcher as a tour guide agent for the web, the learning algorithms used by WebWatcher, experimental results based on learning from thousands of users, and lessons learned from this case study of tour guide agents.}, } @inProceedings{Joachims98, author = {Thorsten Joachims}, title = {Text categorization with support vector machines: learning with many relevant features}, booktitle = {Proceedings of ECML-98, 10th European Conference on Machine Learning}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1398}, editor = {Claire N{\'{e}}dellec and C{\'{e}}line Rouveirol}, address = {Chemnitz, DE}, pages = {137--142}, year = {1998}, url = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_98a.ps.gz}, abstract = {The paper explores the use of Support Vector Machines (SVMs) for learning text classifiers from examples. It analyzes the particular properties of learning with text data and identifies why SVMs are appropriate for this task.
Empirical results support the theoretical findings. SVMs achieve substantial improvements over the currently best performing methods and behave robustly over a variety of different learning tasks. Furthermore, they are fully automatic, eliminating the need for manual parameter tuning.}, } @inProceedings{Joachims99, author = {Thorsten Joachims}, title = {Transductive Inference for Text Classification using Support Vector Machines}, booktitle = {Proceedings of ICML-99, 16th International Conference on Machine Learning}, editor = {Ivan Bratko and Saso Dzeroski}, year = {1999}, address = {Bled, SL}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, pages = {200--209}, url = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_99c.ps.gz}, abstract = {This paper introduces transductive support vector machines (TSVMs) for text classification. While regular support vector machines (SVMs) try to induce a general decision function for a learning task, TSVMs take into account a particular test set and try to minimize misclassifications of just those particular examples. The paper presents an analysis of why TSVMs are well suited for text classification. These theoretical findings are supported by experiments on three test collections. The experiments show substantial improvements over inductive methods, especially for small training sets, cutting the number of labeled training examples down to a 20th on some tasks. This work also proposes an algorithm for training TSVMs efficiently, handling 10,000 examples and more.}, } @article{Juan02, author = {Juan, Alfons and Vidal, Enrique}, title = {On the use of {Bernoulli} mixture models for text classification}, journal = {Pattern Recognition}, year = {2002}, volume = {35}, number = {12}, pages = {2705--2710}, url = {}, abstract = {Mixture modelling of class-conditional densities is a standard pattern recognition technique.
Although most research on mixture models has concentrated on mixtures for continuous data, emerging pattern recognition applications demand extending research efforts to other data types. This paper focuses on the application of mixtures of multivariate Bernoulli distributions to binary data. More concretely, a text classification task aimed at improving language modelling for machine translation is considered.}, } @inProceedings{Junker00, author = {Markus Junker and Michael Sintek and Matthias Rinck}, title = {Learning for text categorization and information extraction with {ILP}}, booktitle = {Proceedings of the 1st Workshop on Learning Language in Logic}, editor = {Cussens, James and Saso Dzeroski}, year = {2000}, address = {Bled, SL}, pages = {247--258}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1925}, url = {}, abstract = {Text categorization (TC) and information extraction (IE) are two important goals of natural language processing. While hand-crafting rules for both tasks has a long tradition, learning approaches used to gain much interest in the past. Since in both tasks text as a sequence of words is of crucial importance, propositional learners have strong limitations. Although viewing learning for TC and IE as inductive logic programming (ILP) problems is obvious, most approaches rather use proprietary formalisms. In this paper, we provide a solid basis for the application of ILP methods to these learning problems. We introduce three basic types (namely a type for text, one for words and one for positions in texts) and three simple predicate definitions over these types which enable us to write TC and IE rules as logic programs. Based on the proposed representation, we present an approach to the problem of learning rules for TC and IE in terms of ILP.
We conclude by comparing our approach of representing texts and rules as logic programs to others.}, } @inProceedings{Junker01, author = {Markus Junker and Andreas Dengel}, title = {Preventing Overfitting in Learning Text Patterns for Document Categorization}, booktitle = {Proceedings of ICAPR-01, 2nd International Conference on Advances in Pattern Recognition}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2013}, editor = {Sameer Singh and Nabeel A. Murshed and Walter Kropatsch}, address = {Rio De Janeiro, BR}, year = {2001}, pages = {137--146}, url = {http://link.springer.de/link/service/series/0558/papers/2013/20130137.pdf}, abstract = {There is an increasing interest in categorizing texts using learning algorithms. While the majority of approaches rely on learning linear classifiers, there is also some interest in describing document categories by text patterns. We introduce a model for learning patterns for text categorization (the LPT-model) that does not rely on an attribute-value representation of documents but represents documents essentially "as they are". Based on the LPT-model, we focus on learning patterns within a relatively simple pattern language. We compare different search heuristics and pruning methods known from various symbolic rule learners on a set of representative text categorization problems. The best results were obtained using the m-estimate as search heuristics combined with the likelihood-ratio-statics for pruning. Even better results can be obtained, when replacing the likelihood-ratio-statics by a new measure for pruning; this we call l-measure. 
In contrast to conventional measures for pruning, the l-measure takes into account properties of the search space.}, } @inProceedings{Junker97, author = {Markus Junker and Andreas Abecker}, title = {Exploiting Thesaurus Knowledge in Rule Induction for Text Classification}, booktitle = {Proceedings of RANLP-97, 2nd International Conference on Recent Advances in Natural Language Processing}, publisher = {}, editor = {Ruslan Mitkov and Nicolas Nicolov and Nikolai Nikolov}, address = {Tzigov Chark, BL}, pages = {202--207}, year = {1997}, url = {http://www.dfki.uni-kl.de/~junker/download/ranlp97.ps}, abstract = {Systems for learning text classifiers recently gained considerable interest. One technique to implement such systems is rule induction. While most other approaches rely on a relatively simple document representation and do not make use of any background knowledge, rule induction algorithms offer a good potential for improvements in both of these areas. In this paper, we show how an operator-based view of rule induction enables the easy integration of a thesaurus as background knowledge. Results with an algorithm extended by thesaurus knowledge are presented and interpreted. The interpretation shows the strengths and weaknesses of using thesaurus knowledge and gives hints for future research.}, } @article{Junker98, author = {Markus Junker and Rainer Hoch}, title = {An experimental evaluation of {OCR} text representations for learning document classifiers}, journal = {International Journal on Document Analysis and Recognition}, pages = {116--122}, year = {1998}, number = {2}, volume = {1}, url = {http://link.springer.de/link/service/journals/10032/papers/8001002/80010116.ps.gz}, abstract = {In the literature, many feature types are proposed for document classification. However, an extensive and systematic evaluation of the various approaches has not yet been done. In particular, evaluations on OCR documents are very rare.
In this paper we investigate seven text representations based on n-grams and single words. We compare their effectiveness in classifying OCR texts and the corresponding correct ASCII texts in two domains: business letters and abstracts of technical reports. Our results indicate that the use of n-grams is an attractive technique which can even compare to techniques relying on a morphological analysis. This holds for OCR texts as well as for correct ASCII texts.}, } @article{Kaban02, author = {Ata Kaban and Mark Girolami}, title = {A Dynamic Probabilistic Model to Visualise Topic Evolution in Text Streams}, journal = {Journal of Intelligent Information Systems}, year = {2002}, note = {Special Issue on Automated Text Categorization}, volume = {18}, number = {2/3}, pages = {107--125}, url = {http://www.wkap.nl/article.pdf?391242}, abstract = {We propose a novel probabilistic method, based on latent variable models, for unsupervised topographic visualisation of dynamically evolving, coherent textual information. This can be seen as a complementary tool for topic detection and tracking applications. This is achieved by the exploitation of the a priori domain knowledge available, that there are relatively homogeneous temporal segments in the data stream. In a different manner from topographical techniques previously utilized for static text collections, the topography is an outcome of the coherence in time of the data stream in the proposed model. 
Simulation results on both toy-data settings and an actual application on Internet chat line discussion analysis is presented by way of demonstration.}, } @inProceedings{Kao03, author = {Anne Kao and Lesley Quach and Steve Poteet and Steve Woods}, title = {User assisted text classification and knowledge management}, booktitle = {Proceedings of CIKM-03, 12th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, editor = {}, year = {2003}, address = {New Orleans, US}, pages = {524--527}, url = {http://doi.acm.org/10.1145/956863.956964}, abstract = {While there are many aspects to managing corporate knowledge, one key issue is how to organize corporate documents into categories of interest. In this paper, we focus on using user assisted text classification in conjunction with a web portal, multiple document management systems and an ontology, to provide a powerful solution for organizing information about a company's technology. We propose a system that interacts with an author using an automatic text classifier to suggest controlled keywords to be used as metadata. The proposed approach does not require professional librarians or that the end users have extensive training. The use of a controlled vocabulary allows for a more consistent description of corporate documents, and promotes easier access by people across the company. It is easier to find similar documents which use different nomenclature. Finally, the interactive nature of the system results in a more correct and precise description of each document than a fully automatic system would.}, } @article{Kar78, author = {Gautam Kar and Lee J. 
White}, title = {A distance measure for automated document classification by sequential analysis}, journal = {Information Processing and Management}, pages = {57--69}, year = {1978}, number = {2}, volume = {14}, url = {}, abstract = {}, } @inProceedings{Karypis00, author = {George Karypis and Eui-Hong Han}, title = {Fast Supervised Dimensionality Reduction Algorithm with Applications to Document Categorization and Retrieval}, booktitle = {Proceedings of CIKM-00, 9th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, address = {McLean, US}, editor = {Arvin Agah and Jamie Callan and Elke Rundensteiner}, year = {2000}, pages = {12--19}, url = {ftp://ftp.cs.umn.edu/dept/users/kumar/cikm-ci.ps}, abstract = {Retrieval techniques based on dimensionality reduction, such as Latent Semantic Indexing (LSI), have been shown to improve the quality of the information being retrieved by capturing the latent meaning of the words present in the documents. Unfortunately, the high computational and memory requirements of LSI and its inability to compute an effective dimensionality reduction in a supervised setting limits its applicability. In this paper we present a fast supervised dimensionality reduction algorithm that is derived from the recently developed cluster-based unsupervised dimensionality reduction algorithms. We experimentally evaluate the quality of the lower dimensional spaces both in the context of document categorization and improvements in retrieval performance on a variety of different document collections. Our experiments show that the lower dimensional spaces computed by our algorithm consistently improve the performance of traditional algorithms such as C4.5, k-nearest-neighbor, and Support Vector Machines (SVM), by an average of 2\% to 7\%. 
Furthermore, the supervised lower dimensional space greatly improves the retrieval performance when compared to LSI.}, } @inProceedings{Kawatani02, author = {Takahiko Kawatani}, title = {Topic Difference Factor Extraction between Two Document Sets and its Application to Text Categorization}, booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on Research and Development in Information Retrieval}, editor = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng and Kalervo J{\"{a}}rvelin}, publisher = {ACM Press, New York, US}, address = {Tampere, FI}, year = {2002}, pages = {137--144}, url = {http://doi.acm.org/10.1145/564376.564402}, abstract = {To improve performance in text categorization, it is important to extract distinctive features for each class. This paper proposes topic difference factor analysis (TDFA) as a method to extract projection axes that reflect topic differences between two document sets. Suppose all sentence vectors that compose each document are projected onto projection axes. TDFA obtains the axes that maximize the ratio between the document sets as to the sum of squared projections by solving a generalized eigenvalue problem. The axes are called topic difference factors (TDF's). By applying TDFA to the document set that belongs to a given class and a set of documents that is misclassified as belonging to that class by an existent classifier, we can obtain features that take large values in the given class but small ones in other classes, as well as features that take large values in other classes but small ones in the given class. A classifier was constructed applying the above features to complement the kNN classifier. 
As the results, the micro averaged F1 measure for Reuters-21578 improved from 83.69 to 87.27\%.}, } @article{Kehagias03, title = {A Comparison of Word- and Sense-based Text Categorization Using Several Classification Algorithms}, author = {Athanasios Kehagias and Vassilios Petridis and Vassilis G. Kaburlasos and Pavlina Fragkou}, journal = {Journal of Intelligent Information Systems}, year = {2003}, volume = {21}, number = {3}, pages = {227--247}, url = {http://www.wkap.nl/article.pdf?391243}, abstract = {Most of the text categorization algorithms in the literature represent documents as collections of words. An alternative which has not been sufficiently explored is the use of word meanings, also known as senses. In this paper, using several algorithms, we compare the categorization accuracy of classifiers based on words to that of classifiers based on senses. The document collection on which this comparison takes place is a subset of the annotated Brown Corpus semantic concordance. A series of experiments indicates that the use of senses does not result in any significant categorization improvement.}, } @inProceedings{Kessler97, author = {Brett Kessler and Geoff Nunberg and Hinrich Sch{\"{u}}tze}, title = {Automatic detection of text genre}, booktitle = {Proceedings of ACL-97, 35th Annual Meeting of the Association for Computational Linguistics}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, editor = {Philip R. Cohen and Wolfgang Wahlster}, year = {1997}, address = {Madrid, ES}, pages = {32--38}, url = {ftp://parcftp.xerox.com/pub/qca/genre/paper.acl97.ps.Z}, abstract = {As the text databases available to users become larger and more heterogeneous, genre becomes increasingly important for computational linguistics as a complement to topical and structural principles of classification. 
We propose a theory of genres as bundles of facets, which correlate with various surface cues, and argue that genre detection based on surface cues is as successful as detection based on deeper structural properties.}, } @inProceedings{Khmelev03, author = {Dmitry V. Khmelev and William J. Teahan}, title = {A repetition based measure for verification of text collections and for text categorization}, booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on Research and Development in Information Retrieval}, editor = {Jamie Callan and Gordon Cormack and Charles Clarke and David Hawking and Alan Smeaton}, publisher = {ACM Press, New York, US}, address = {Toronto, CA}, year = {2003}, pages = {104--110}, url = {http://doi.acm.org/10.1145/860435.860456}, abstract = {We suggest a way for locating duplicates and plagiarisms in a text collection using an R-measure, which is the normalized sum of the lengths of all suffixes of the text repeated in other documents of the collection. The R-measure can be effectively computed using the suffix array data structure. Additionally, the computation procedure can be improved to locate the sets of duplicate or plagiarised documents. We applied the technique to several standard text collections and found that they contained a significant number of duplicate and plagiarised documents. Another reformulation of the method leads to an algorithm that can be applied to supervised multi-class categorization. We illustrate the approach using the recently available Reuters Corpus Volume 1 (RCV1). 
The results show that the method outperforms SVM at multi-class categorization, and interestingly, that results correlate strongly with compression-based methods.}, } @inProceedings{Kim00, author = {Yu-Hwan Kim and Shang-Yoon Hahn and Byoung-Tak Zhang}, title = {Text filtering by boosting naive {Bayes} classifiers}, booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on Research and Development in Information Retrieval}, editor = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong}, publisher = {ACM Press, New York, US}, address = {Athens, GR}, year = {2000}, pages = {168--175}, url = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p168-kim/p168-kim.pdf}, abstract = {Several machine learning algorithms have recently been used for text categorization and filtering. In particular, boosting methods such as AdaBoost have shown good performance applied to real text data. However, most of existing boosting algorithms are based on classifiers that use binary-valued features. Thus, they do not fully make use of the weight information provided by standard term weighting methods. In this paper, we present a boosting-based learning method for text filtering that uses naive Bayes classifiers as a weak learner. The use of naive Bayes allows the boosting algorithm to utilize term frequency information while maintaining probabilistically accurate confidence ratio. Applied to TREC-7 and TREC-8 filtering track documents, the proposed method obtained a significant improvement in LF1, LF2, F1 and F3 measures compared to the best results submitted by other TREC entries.}, } @inProceedings{Kim04, author = {Sang-Bum Kim and Hae-Chang Rim}, title = {Recomputation of Class Relevance Scores for Improving Text Classification}, booktitle = {Proceedings of CICLING-04, 5th International Conference on Computational Linguistics and Intelligent Text Processing}, year = {2004}, editor = {Alexander F.
Gelbukh}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Seoul, KR}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2945}, pages = {580--583}, url = {}, abstract = {}, } @article{Kim04a, author = {Kim, J. and Kim, M.}, title = {An evaluation of passage-based text categorization}, journal = {Journal of Intelligent Information Systems}, year = {2004}, volume = {23}, number = {1}, pages = {47--65}, url = {http://dx.doi.org/10.1023/B:JIIS.0000029670.53363.d0}, abstract = {Researches in text categorization have been confined to whole-document-level classification, probably due to lack of full-text test collections. However, full-length documents available today in large quantities pose renewed interests in text classification. A document is usually written in an organized structure to present its main topic(s). This structure can be expressed as a sequence of subtopic text blocks, or passages. In order to reflect the subtopic structure of a document, we propose a new passage-level or passage-based text categorization model, which segments a test document into several passages, assigns categories to each passage, and merges the passage categories to the document categories. Compared with traditional document-level categorization, two additional steps, passage splitting and category merging, are required in this model. Using four subsets of the Reuters text categorization test collection and a full-text test collection of which documents are varying from tens of kilobytes to hundreds, we evaluate the proposed model, especially the effectiveness of various passage types and the importance of passage location in category merging. Our results show simple windows are best for all test collections tested in these experiments.
We also found that passages have different degrees of contribution to the main topic(s), depending on their location in the test document.}, } @inProceedings{Kindermann01, author = {J{\"{o}}rg Kindermann and Gerhard Paa{\ss} and Edda Leopold}, title = {Error Correcting Codes with Optimized {Kullback-Leibler} Distances for Text Categorization}, booktitle = {Proceedings of ECML-01, 12th European Conference on Machine Learning}, editor = {De Raedt, Luc and Siebes, Arno}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Freiburg, DE}, year = {2001}, pages = {266--275}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2168}, url = {http://link.springer.de/link/service/series/0558/papers/2168/21680266.pdf}, abstract = {We extend a multi-class categorization scheme proposed by Dietterich and Bakiri 1995 for binary classifiers, using error correcting codes. The extension comprises the computation of the codes by a simulated annealing algorithm and optimization of Kullback-Leibler (KL) category distances within the code-words. For the first time, we apply the scheme to text categorization with support vector machines (SVMs) on several large text corpora with more than 100 categories. The results are compared to 1-of-N coding (i.e.\ one SVM for each text category). We also investigate codes with optimized KL distance between the text categories which are merged in the code-words. We find that error correcting codes perform better than 1-of-N coding with increasing code length.
For very long codes, the performance is in some cases further improved by KL-distance optimization.}, } @inProceedings{Klas00, author = {Klas, Claus-Peter and Fuhr, Norbert}, title = {A new Effective Approach for Categorizing Web Documents}, booktitle = {Proceedings of BCSIRSG-00, the 22nd Annual Colloquium of the British Computer Society Information Retrieval Specialist Group}, editor = {}, address = {Cambridge, UK}, year = {2000}, pages = {}, publisher = {}, url = {http://ls6-www.informatik.uni-dortmund.de/bib/fulltext/ir/Klas_Fuhr:00.ps.gz}, abstract = {Categorization of Web documents poses a new challenge for automatic classification methods. In this paper, we present the megadocument approach for categorization. For each category, all corresponding document texts from the training sample are concatenated to a megadocument, which is indexed using standard methods. In order to classify a new document, the most similar megadocument determines the category to be assigned. Our evaluations show that for Web collections, the megadocument method clearly outperforms other classification methods. In contrast, for the Reuters collection, we only achieve mediocre results. Thus, our method seems to be well suited for heterogeneous document collections.}, } @article{Klingbiel73, author = {Paul H. Klingbiel}, title = {Machine-aided indexing of technical literature}, journal = {Information Storage and Retrieval}, year = {1973}, volume = {9}, number = {2}, pages = {79--84}, url = {}, abstract = {To index successfully in the Defense Documentation Center's environment, an automated system must choose single words or phrases (dependent upon context) rapidly and economically. The automation of DDC's indexing has been machine-aided from its inception. A machine-aided indexing (MAI) system is described that indexes one million words of text per hour of CPU time. Grammatical errors do not exceed five per cent of the output, so human screening is satisfactorily low. 
The system could potentially scale up to an operational size of 10 million words of text per year - the equivalent of a dozen bibles or a third of the Encyclopedia Britannica. In a batch mode, the programs to accomplish this indexing would require no more than fifteen minutes of CPU time per week.}, } @article{Klingbiel73a, author = {Paul H. Klingbiel}, title = {A technique for machine-aided indexing}, journal = {Information Storage and Retrieval}, year = {1973}, volume = {9}, number = {9}, pages = {477--494}, url = {}, abstract = {Subject indexing of text can, in principle, be accomplished in many ways. The technique for machine-aided indexing (MAI) developed at the Defense Documentation Center (DDC) is illustrated on a randomly chosen abstract. Additional text is provided in coded form so that the reader can more fully explore this technique and form his own opinion of the applicability and versatility of this particular procedure. The DDC method for subject indexing is very close to operational status for a data base which grows at the rate of two million words of text per year.}, } @inProceedings{Klinkenberg00, author = {Ralf Klinkenberg and Thorsten Joachims}, title = {Detecting concept drift with support vector machines}, booktitle = {Proceedings of ICML-00, 17th International Conference on Machine Learning}, editor = {Pat Langley}, year = {2000}, address = {Stanford, US}, pages = {487--494}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/klinkenberg_joachims_2000a.pdf.gz}, abstract = {For many learning tasks where data is collected over an extended period of time, its underlying distribution is likely to change. A typical example is information filtering, i.e. the adaptive classification of documents with respect to a particular user interest. Both the interest of the user and the document content change over time. A filtering system should be able to adapt to such concept changes. 
This paper proposes a new method to recognize and handle concept changes with support vector machines. The method maintains a window on the training data. The key idea is to automatically adjust the window size so that the estimated generalization error is minimized. The new approach is both theoretically well-founded as well as effective and efficient in practice. Since it does not require complicated parameterization, it is simpler to use and more robust than comparable heuristics. Experiments with simulated concept drift scenarios based on real-world text data compare the new method with other window management approaches. We show that it can effectively select an appropriate window size in a robust way.}, } @inProceedings{Knorz82, author = {Knorz, Gerhard}, title = {A decision theory approach to optimal automated indexing}, booktitle = {Proceedings of SIGIR-82, 5th ACM International Conference on Research and Development in Information Retrieval}, year = {1982}, editor = {Gerard Salton and Hans-Jochen Schneider}, pages = {174--193}, address = {Berlin, DE}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 146}, url = {}, abstract = {}, } @inProceedings{Ko00, author = {Youngjoong Ko and Jungyun Seo}, title = {Automatic Text Categorization by Unsupervised Learning}, booktitle = {Proceedings of COLING-00, the 18th International Conference on Computational Linguistics}, year = {2000}, editor = {}, pages = {}, address = {Saarbr{\"{u}}cken, DE}, url = {http://nlp3.korea.ac.kr/proceeding/coling2000/COLING/ps/066.ps}, abstract = {The goal of text categorization is to classify documents into a certain number of pre-defined categories. The previous works in this area have used a large number of labeled training documents for supervised learning. One problem is that it is difficult to create the labeled training documents. 
While it is easy to collect the unlabeled documents, it is not so easy to manually categorize them for creating training documents. In this paper, we propose an unsupervised learning method to overcome these difficulties. The proposed method divides the documents into sentences, and categorizes each sentence using keyword lists of each category and sentence similarity measure. And then, it uses the categorized sentences for training. The proposed method shows a similar degree of performance, compared with the traditional supervised learning methods. Therefore, this method can be used in areas where low-cost text categorization is needed. It also can be used for creating training documents.}, } @inProceedings{Ko02, author = {Youngjoong Ko and Jinwoo Park and Jungyun Seo}, title = {Automatic Text Categorization using the Importance of Sentences}, booktitle = {Proceedings of COLING-02, the 19th International Conference on Computational Linguistics}, year = {2002}, editor = {}, pages = {}, address = {Taipei, TW}, url = {http://acl.ldc.upenn.edu/coling2002/proceedings/data/area-28/co-201.pdf}, abstract = {Automatic text categorization is a problem of automatically assigning text documents to predefined categories. In order to classify text documents, we must extract good features from them. In previous research, a text document is commonly represented by the term frequency and the inverted document frequency of each feature. Since there is a difference between important sentences and unimportant sentences in a document, the features from more important sentences should be considered more than other features. In this paper, we measure the importance of sentences using text summarization techniques. Then a document is represented as a vector of features with different weights according to the importance of each sentence. To verify our new method, we conducted experiments on two language newsgroup data sets: one written by English and the other written by Korean. Four kinds of classifiers were used in our experiments: Naive Bayes, Rocchio, k-NN, and SVM. 
We observed that our new method made a significant improvement in all classifiers and both data sets.}, } @inProceedings{Ko02a, author = {Youngjoong Ko and Jungyun Seo}, title = {Text Categorization using Feature Projections}, booktitle = {Proceedings of COLING-02, the 19th International Conference on Computational Linguistics}, year = {2002}, editor = {}, pages = {}, address = {Taipei, TW}, url = {http://acl.ldc.upenn.edu/coling2002/proceedings/data/area-28/co-269.pdf}, abstract = {This paper proposes a new approach for text categorization, based on a feature projection technique. In our approach, training data are represented as the projections of training documents on each feature. The voting for a classification is processed on the basis of individual feature projections. The final classification of test documents is determined by a majority voting from the individual classifications of each feature. Our empirical results show that the proposed approach, Text Categorization using Feature Projections (TCFP), outperforms k-NN, Rocchio, and Naive Bayes. Most of all, TCFP is about one hundred times faster than k-NN. Since TCFP algorithm is very simple, its implementation and training process can be done very easily. 
For these reasons, TCFP can be a useful classifier in the areas, which need a fast and high-performance text categorization task.}, } @article{Ko04, author = {Youngjoong Ko and Jinwoo Park and Jungyun Seo}, title = {Improving Text Categorization using the Importance of Sentences}, journal = {Information Processing and Management}, year = {2004}, volume = {40}, number = {1}, pages = {65--79}, url = {}, abstract = {}, } @article{Ko04a, author = {Youngjoong Ko and Jinwoo Park and Jungyun Seo}, title = {Using the feature projection technique based on a normalized voting method for text classification}, journal = {Information Processing and Management}, year = {2004}, volume = {40}, number = {2}, pages = {191--208}, url = {}, abstract = {}, } @inProceedings{Koehn02, author = {Philipp Koehn}, title = {Combining Multiclass Maximum Entropy Text Classifiers with Neural Network Voting}, booktitle = {Proceedings of PorTAL-02, 3rd International Conference on Advances in Natural Language Processing}, year = {2002}, editor = {Elisabete Ranchhod and Nuno J. Mamede}, pages = {125--132}, address = {Faro, PT}, url = {http://link.springer.de/link/service/series/0558/papers/2389/23890125.pdf}, abstract = {We improve a high-accuracy maximum entropy classifier by combining an ensemble of classifiers with neural network voting. In our experiments we demonstrate significantly superior performance both over a single classifier as well as over the use of the traditional weighted-sum voting approach. Specifically, we apply this to a maximum entropy classifier on a large scale multi-class text categorization task: the online job directory Flipdog with over half a million jobs in 65 categories.}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2389}, } @inProceedings{Kolcz01, author = {Aleksander Kolcz and Vidya Prabakarmurthi and Jugal K. 
Kalita}, title = {String Match and Text Extraction: Summarization as feature selection for text categorization}, booktitle = {Proceedings of CIKM-01, 10th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, editor = {Henrique Paques and Ling Liu and David Grossman}, year = {2001}, address = {Atlanta, US}, pages = {365--370}, url = {http://doi.acm.org/10.1145/502585.502647}, abstract = {We address the problem of evaluating the effectiveness of summarization techniques for the task of document categorization. It is argued that for a large class of automatic categorization algorithms, extraction-based document categorization can be viewed as a particular form of feature selection performed on the full text of the document and, in this context, its impact can be compared with state-of-the-art feature selection techniques especially devised to provide good categorization performance. Such a framework provides for a better assessment of the expected performance of a categorizer if the compression rate of the summarizer is known.}, } @inProceedings{Koller97, author = {Daphne Koller and Mehran Sahami}, title = {Hierarchically classifying documents using very few words}, booktitle = {Proceedings of ICML-97, 14th International Conference on Machine Learning}, editor = {Douglas H. Fisher}, year = {1997}, address = {Nashville, US}, pages = {170--178}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://robotics.stanford.edu/users/sahami/papers-dir/ml97-hier.ps}, abstract = {The proliferation of topic hierarchies for text documents has resulted in a need for tools that automatically classify new documents within such hierarchies. 
Existing classification schemes which ignore the hierarchical structure and treat the topics as separate classes are often inadequate in text classification where there is a large number of classes and a huge number of relevant features needed to distinguish between them. We propose an approach that utilizes the hierarchical topic structure to decompose the classification task into a set of simpler problems, one at each node in the classification tree. As we show, each of these smaller problems can be solved accurately by focusing only on a very small set of features, those relevant to the task at hand. This set of relevant features varies widely throughout the hierarchy, so that, while the overall relevant feature set may be large, each classifier only examines a small subset. The use of reduced feature sets allows us to utilize more complex (probabilistic) models, without encountering many of the standard computational and robustness difficulties.}, } @inProceedings{Kongovi02, author = {Madhusudhan Kongovi and Juan Carlos Guzman and Venu Dasigi}, title = {Text Categorization: An experiment using Phrases}, booktitle = {Proceedings of ECIR-02, 24th European Colloquium on Information Retrieval Research}, editor = {Fabio Crestani and Mark Girolami and Van Rijsbergen, Cornelis J.}, year = {2002}, address = {Glasgow, UK}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2291}, pages = {213--228}, url = {http://link.springer.de/link/service/series/0558/papers/2291/22910213.pdf}, abstract = {Typical text classifiers learn from example and training documents that have been manually categorized. In this research, our experiment dealt with the classification of news wire articles using category profiles. We built these profiles by selecting feature words and phrases from the training documents. For our experiments we decided on using the text corpus Reuters-21578. 
We used precision and recall to measure the effectiveness of our classifier. Though our experiments with words yielded good results, we found instances where the phrase-based approach produced more effectiveness. This could be due to the fact that when a word along with its adjoining word - a phrase - is considered towards building a category profile, it could be a good discriminator. This tight packaging of word pairs could bring in some semantic value. The packing of word pairs also filters out words occurring frequently in isolation that do not bear much weight towards characterizing that category.}, } @article{Koppel02, author = {Koppel, Moshe and Argamon, Shlomo and Shimoni, Anat R.}, title = {Automatically categorizing written texts by author gender}, journal = {Literary and Linguistic Computing}, year = {2002}, number = {4}, volume = {17}, pages = {401--412}, url = {http://www3.oup.co.uk/litlin/hdb/Volume_17/Issue_04/pdf/170401.pdf}, abstract = {The problem of automatically determining the gender of a document's author would appear to be a more subtle problem than those of categorization by topic or authorship attribution. Nevertheless, it is shown that automated text categorization techniques can exploit combinations of simple lexical and syntactic features to infer the gender of the author of an unseen formal written document with approximately 80 per cent accuracy. The same techniques can be used to determine if a document is fiction or non-fiction with approximately 98 per cent accuracy.}, } @inProceedings{Kosmynin96, author = {Arkadi Kosmynin and Ian Davidson}, title = {Using background contextual knowledge for documents representation}, booktitle = {Proceedings of PODP-96, 3rd International Workshop on Principles of Document Processing}, editor = {Charles K. 
Nicholas and Derick Wood}, year = {1996}, address = {Palo Alto, US}, pages = {123--133}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1293}, url = {}, abstract = {We describe our approach to document representation that captures contextual dependencies between terms in a corpus and makes use of these dependencies to represent documents. We have tried our representation scheme for automatic document categorisation on the Reuters' test set of documents. We achieve a precision recall break even point of 84\% which is comparable to the best known published results. Our approach acts as a feature selection technique that is an alternative to applying the techniques from machine learning and numerical taxonomy.}, } @inProceedings{Koster03, author = {Cornelis H. Koster and Mark Seutter}, title = {Taming wild phrases}, booktitle = {Proceedings of ECIR-03, 25th European Conference on Information Retrieval}, publisher = {Springer Verlag, Heidelberg, DE}, editor = {Fabrizio Sebastiani}, address = {Pisa, IT}, year = {2003}, pages = {161--176}, url = {http://www.cs.kun.nl/~kees/peking/ecir03.pdf}, abstract = {In this paper the suitability of different document representations for automatic document classification is compared, investigating a whole range of representations between bag-of-words and bag-of-phrases. We look at some of their statistical properties, and determine for each representation the optimal choice of classification parameters and the effect of Term Selection. Phrases are represented by an abstraction called Head/Modifier pairs. Rather than just throwing phrases and keywords together, we shall start with pure HM pairs and gradually add more keywords to the document representation. We use the classification on keywords as the baseline, which we compare with the contribution of the pure HM pairs to classification accuracy, and the incremental contributions from heads and modifiers. 
Finally, we measure the accuracy achieved with all words and all HM pairs combined, which turns out to be only marginally above the baseline. We conclude that even the most careful term selection cannot overcome the differences in Document Frequency between phrases and words, and propose the use of term clustering to make phrases more cooperative.}, } @article{Krier02, author = {Marc Krier and Francesco Zacc{\`a}}, title = {Automatic categorization applications at the European Patent Office}, journal = {World Patent Information}, year = {2002}, volume = {24}, number = {}, pages = {187--196}, url = {}, abstract = {}, } @inProceedings{Krishnapuram03, author = {Raghu Krishnapuram and Krishna Chitrapura and Sachindra Joshi}, title = {Classification of Text Documents Based on Minimum System Entropy}, booktitle = {Proceedings of ICML-03, 20th International Conference on Machine Learning}, editor = {}, year = {2003}, address = {Washington, US}, pages = {}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {}, abstract = {}, } @inProceedings{Kumaran04, author = {Giridhar Kumaran and James Allan}, title = {Text classification and named entities for new event detection}, booktitle = {Proceedings of SIGIR-04, 27th ACM International Conference on Research and Development in Information Retrieval}, editor = {Kalervo J{\"{a}}rvelin and James Allan and Peter Bruza and Mark Sanderson}, publisher = {ACM Press, New York, US}, address = {Sheffield, UK}, year = {2004}, pages = {297--304}, url = {http://doi.acm.org/10.1145/1008992.1009044}, abstract = {New Event Detection is a challenging task that still offers scope for great improvement after years of effort. In this paper we show how performance on New Event Detection (NED) can be improved by the use of text classification techniques as well as by using named entities in a new way. We explore modifications to the document representation in a vector space-based NED system. 
We also show that addressing named entities preferentially is useful only in certain situations. A combination of all the above results in a multi-stage NED system that performs much better than baseline single-stage NED systems.}, } @inProceedings{Kwok98, author = {James T. Kwok}, title = {Automated text categorization using support vector machine}, booktitle = {Proceedings of ICONIP'98, 5th International Conference on Neural Information Processing}, editor = {}, year = {1998}, address = {Kitakyushu, JP}, pages = {347--351}, url = {http://www.comp.hkbu.edu.hk/~jamesk/papers/iconip98.ps.gz}, abstract = {In this paper, we study the use of support vector machine in text categorization. Unlike other machine learning techniques, it allows easy incorporation of new documents into an existing trained system. Moreover, dimension reduction, which is usually imperative, now becomes optional. Thus, SVM adapts efficiently in dynamic environments that require frequent additions to the document collection. 
Empirical results on the Reuters-22173 collection are also discussed.}, } @article{Kwon03, author = {Oh-Woog Kwon and Jong-Hyeok Lee}, title = {Text categorization based on {k}-nearest neighbor approach for Web site classification}, journal = {Information Processing and Management}, year = {2003}, volume = {39}, number = {1}, pages = {25--44}, url = {}, abstract = {}, } @inProceedings{Kwon99, author = {Oh-Woog Kwon and Sung-Hwa Jung and Jong-Hyeok Lee and Geunbae Lee}, title = {Evaluation of Category Features and Text Structural Information on a Text Categorization Using Memory Based Reasoning}, booktitle = {Proceedings of ICCPOL-99, 18th International Conference on Computer Processing of Oriental Languages}, editor = {}, year = {1999}, address = {Tokushima, JP}, pages = {153--158}, url = {}, abstract = {}, } @inProceedings{Labrou99, author = {Yannis Labrou and Tim Finin}, title = {{{\sc Yahoo!}} as an ontology: using {{\sc Yahoo!}}\ categories to describe documents}, booktitle = {Proceedings of CIKM-99, 8th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, editor = {}, year = {1999}, address = {Kansas City, US}, pages = {180--187}, url = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p180-labrou/p180-labrou.pdf}, abstract = {We suggest that one (or a collection) of names of {{\sc Yahoo!}}\ (or any other WWW indexer's) categories can be used to describe the content of a document. Such categories offer a standardized and universal way for referring to or describing the nature of real world objects, activities, documents and so on, and may be used (we suggest) to semantically characterize the content of documents. WWW indices, like {{\sc Yahoo!}}\ provide a huge hierarchy of categories (topics) that touch every aspect of human endeavors. 
Such topics can be used as descriptors, similarly to the way librarians use for example, the Library of Congress cataloging system to annotate and categorize books. In the course of investigating this idea, we address the problem of automatic categorization of webpages in the {{\sc Yahoo!}}\ directory. We use Telltale as our classifier; Telltale uses n-grams to compute the similarity between documents. We experiment with various types of descriptions for the {{\sc Yahoo!}}\ categories and the webpages to be categorized. Our findings suggest that the best results occur when using the very brief descriptions of the {{\sc Yahoo!}}\ categorized entries; these brief descriptions are provided either by the entries' submitters or by the {{\sc Yahoo!}}\ human indexers and accompany most {{\sc Yahoo!}}-indexed entries.}, } @inProceedings{Lai01, author = {Kwok-Yin Lai and Wai Lam}, title = {Meta-learning Models for Automatic Textual Document Categorization}, booktitle = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on Knowledge Discovery and Data Mining}, editor = {David Cheung and Qing Li and Graham Williams}, year = {2001}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Hong Kong, CN}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2035}, pages = {78--89}, url = {http://link.springer.de/link/service/series/0558/papers/2035/20350078.pdf}, abstract = {We investigate two meta-model approaches for the task of automatic textual document categorization. The first approach is the linear combination approach. Based on the idea of distilling the characteristics of how we estimate the merits of each component algorithm, we propose three different strategies for the linear combination approach. The linear combination approach makes use of limited knowledge in the training document set. 
To address this limitation, we propose the second meta-model approach, called Meta-learning Using Document Feature characteristics (MUDOF), which employs a meta-learning phase using document feature characteristics. Document feature characteristics, derived from the training document set, capture some inherent properties of a particular category. Extensive experiments have been conducted on a real-world document collection and satisfactory performance is obtained.}, } @article{Lai02, author = {Yu-Sheng Lai and Chung-Hsien Wu}, title = {Meaningful term extraction and discriminative term selection in text categorization via unknown-word methodology}, journal = {ACM Transactions on Asian Language Information Processing}, year = {2002}, number = {1}, volume = {1}, pages = {34--64}, url = {http://doi.acm.org/10.1145/595576.595579}, abstract = {In this article, an approach based on unknown words is proposed for meaningful term extraction and discriminative term selection in text categorization. For meaningful term extraction, a phrase-like unit (PLU)-based likelihood ratio is proposed to estimate the likelihood that a word sequence is an unknown word. On the other hand, a discriminative measure is proposed for term selection and is combined with the PLU-based likelihood ratio to determine the text category. We conducted several experiments on a news corpus, called MSDN. The MSDN corpus is collected from an online news Website maintained by the Min-Sheng Daily News, Taiwan. The corpus contains 44,675 articles with over 35 million words. The experimental results show that the system using a simple classifier achieved 95.31\% accuracy. When using a state-of-the-art classifier, kNN, the average accuracy is 96.40\%, outperforming all the other systems evaluated on the same collection, including the traditional term-word by kNN (88.52\%); sleeping-experts (82.22\%); sparse phrase by four-word sleeping-experts (86.34\%); and Boolean combinations of words by RIPPER (87.54\%). 
A proposed purification process can effectively reduce the dimensionality of the feature space from 50,576 terms in the word-based approach to 19,865 terms in the unknown word-based approach. In addition, more than 80\% of automatically extracted terms are meaningful. Experiments also show that the proportion of meaningful terms extracted from training data is relative to the classification accuracy in outside testing.}, } @inProceedings{Lam01, author = {Wai Lam and Kwok-Yin Lai}, title = {A Meta-Learning Approach for Text Categorization}, booktitle = {Proceedings of SIGIR-01, 24th ACM International Conference on Research and Development in Information Retrieval}, editor = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin Zobel}, publisher = {ACM Press, New York, US}, address = {New Orleans, US}, year = {2001}, pages = {303--309}, url = {http://portal.acm.org/citation.cfm?doid=383952.384011}, abstract = {We investigate a meta-model approach, called Meta-learning Using Document Feature characteristics (MUDOF), for the task of automatic textual document categorization. It employs a meta-learning phase using document feature characteristics. Document feature characteristics, derived from the training document set, capture some inherent category-specific properties of a particular category. Different from existing categorization methods, MUDOF can automatically recommend a suitable algorithm for each category based on the category-specific statistical characteristics. Hence, different algorithms may be employed for different categories. Experiments have been conducted on a real-world document collection demonstrating the effectiveness of our approach. The results confirm that our meta-model approach can exploit the advantage of its component algorithms, and demonstrate a better performance than existing algorithms.}, } @inProceedings{Lam97, author = {Wai Lam and Kon F. Low and Chao Y. 
Ho}, title = {Using a Bayesian Network Induction Approach for Text Categorization}, booktitle = {Proceedings of IJCAI-97, 15th International Joint Conference on Artificial Intelligence}, editor = {Martha E. Pollack}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, year = {1997}, address = {Nagoya, JP}, pages = {745--750}, url = {}, abstract = {We investigate Bayesian methods for automatic document categorization and develop a new approach to this problem. Our new approach is based on a Bayesian network induction which does not rely on some major assumptions found in a previous method using the Bayesian independence classifier approach. The design of the new approach as well as its justification are presented. Experiments were conducted using a large scale document collection from Reuters news articles. The results show that our approach outperformed the Bayesian independence classifier as measured by a metric that combines precision and recall measures.}, } @inProceedings{Lam98, author = {Wai Lam and Chao Y. Ho}, title = {Using a generalized instance set for automatic text categorization}, booktitle = {Proceedings of SIGIR-98, 21st ACM International Conference on Research and Development in Information Retrieval}, editor = {W. Bruce Croft and Alistair Moffat and Van Rijsbergen, Cornelis J. and Ross Wilkinson and Justin Zobel}, publisher = {ACM Press, New York, US}, year = {1998}, address = {Melbourne, AU}, pages = {81--89}, url = {http://www.acm.org/pubs/articles/proceedings/ir/290941/p81-lam/p81-lam.pdf}, abstract = {We investigate several recent approaches for text categorization under the framework of similarity-based learning. They include two families of text categorization techniques, namely the k-nearest neighbor (k-NN) algorithm and linear classifiers. 
After identifying the weakness and strength of each technique, we propose a new technique known as the generalized instance set (GIS) algorithm by unifying the strengths of k-NN and linear classifiers and adapting to characteristics of text categorization problems. We also explore some variants of our GIS approach. We have implemented our GIS algorithm, the ExpNet algorithm, and some linear classifiers. Extensive experiments have been conducted on two common document corpora, namely the OHSUMED collection and the Reuters-21578 collection. The results show that our new approach outperforms the latest k-NN approach and linear classifiers in all experiments.}, } @inProceedings{Lam99, author = {Savio L. Lam and Dik L. Lee}, title = {Feature Reduction for Neural Network Based Text Categorization}, booktitle = {Proceedings of DASFAA-99, 6th IEEE International Conference on Database Advanced Systems for Advanced Application}, editor = {Arbee L. Chen and Frederick H. Lochovsky}, publisher = {IEEE Computer Society Press, Los Alamitos, US}, year = {1999}, address = {Hsinchu, TW}, pages = {195--202}, url = {http://dlib.computer.org/conferen/dasfaa/0084/pdf/00840195.pdf}, abstract = {In a text categorization model using an artificial neural network as the text classifier scalability is poor if the neural network is trained using the raw feature space since textual data has a very high-dimension feature space. We proposed and compared four dimensionality reduction techniques to reduce the feature space into an input space of much lower dimension for the neural network classifier. To test the effectiveness of the proposed model, experiments were conducted using a subset of the Reuters-22173 test collection for text categorization. The results showed that the proposed model was able to achieve high categorization effectiveness as measured by precision and recall. 
Among the four dimensionality reduction techniques proposed, principal component analysis was found to be the most effective in reducing the dimensionality of the feature space.}, } @article{Lam99a, author = {Lam, Wai and Ruiz, Miguel E. and Srinivasan, Padmini}, title = {Automatic text categorization and its applications to text retrieval}, journal = {IEEE Transactions on Knowledge and Data Engineering}, year = {1999}, number = {6}, volume = {11}, pages = {865--879}, url = {http://www.cs.uiowa.edu/~mruiz/papers/IEEE-TKDE.ps}, abstract = {We develop an automatic text categorization approach and investigate its application to text retrieval. The categorization approach is derived from a combination of a learning paradigm known as instance-based learning and an advanced document retrieval technique known as retrieval feedback. We demonstrate the effectiveness of our categorization approach using two real-world document collections from the MEDLINE database. Next, we investigate the application of automatic categorization to text retrieval. Our experiments clearly indicate that automatic categorization improves the retrieval performance compared with no categorization. We also demonstrate that the retrieval performance using automatic categorization achieves the same retrieval quality as the performance using manual categorization. Furthermore, detailed analysis of the retrieval performance on each individual test query is provided.}, } @inProceedings{Lang95, author = {Ken Lang}, title = {{\sc NewsWeeder}: learning to filter netnews}, booktitle = {Proceedings of ICML-95, 12th International Conference on Machine Learning}, editor = {Armand Prieditis and Stuart J. 
Russell}, address = {Lake Tahoe, US}, pages = {331--339}, year = {1995}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {}, abstract = {}, } @inProceedings{Lanquillon00, author = {Carsten Lanquillon}, title = {Learning from Labeled and Unlabeled Documents: A Comparative Study on Semi-Supervised Text Classification}, booktitle = {Proceedings of PKDD-00, 4th European Conference on Principles of Data Mining and Knowledge Discovery}, editor = {Djamel A. Zighed and Henryk Jan Komorowski and Jan M. Zytkow}, address = {Lyon, FR}, pages = {490--497}, year = {2000}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1910}, url = {http://link.springer.de/link/service/series/0558/papers/1910/19100490.pdf}, abstract = {Supervised learning algorithms usually require large amounts of training data to learn reasonably accurate classifiers. Yet, for many text classification tasks, providing labeled training documents is expensive, while unlabeled documents are readily available in large quantities. Learning from both, labeled and unlabeled documents, in a semi-supervised framework is a promising approach to reduce the need for labeled training documents. This paper compares three commonly applied text classifiers in the light of semi-supervised learning, namely a linear support vector machine, a similarity-based tfidf and a Naive Bayes classifier. Results on a real-world text datasets show that these learners may substantially benefit from using a large amount of unlabeled documents in addition to some labeled documents.}, } @inProceedings{Larkey96, author = {Leah S. Larkey and W. 
Bruce Croft}, title = {Combining classifiers in text categorization}, booktitle = {Proceedings of SIGIR-96, 19th ACM International Conference on Research and Development in Information Retrieval}, editor = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and Ross Wilkinson}, publisher = {ACM Press, New York, US}, year = {1996}, address = {Z{\"{u}}rich, CH}, pages = {289--297}, url = {http://cobar.cs.umass.edu/pubfiles/1combo.ps.gz}, abstract = {Three different types of classifiers were investigated in the context of a text categorization problem in the medical domain: the automatic assignment of ICD9 codes to dictated inpatient discharge summaries. K-nearest-neighbour, relevance feedback, and Bayesian independence classifiers were applied individually and in combination. A combination of different classifiers produced better results than any single type of classifier. For this specific medical categorization problem, new query formulation and weighting methods used in the k-nearest-neighbor classifier improved performance.}, } @inProceedings{Larkey98, author = {Leah S. Larkey}, title = {Automatic essay grading using text categorization techniques}, booktitle = {Proceedings of SIGIR-98, 21st ACM International Conference on Research and Development in Information Retrieval}, editor = {W. Bruce Croft and Alistair Moffat and Van Rijsbergen, Cornelis J. and Ross Wilkinson and Justin Zobel}, publisher = {ACM Press, New York, US}, year = {1998}, address = {Melbourne, AU}, pages = {90--95}, url = {http://cobar.cs.umass.edu/pubfiles/ir-121.ps}, abstract = {Several standard text-categorization techniques were applied to the problem of automated essay grading. Bayesian independence classifiers and k-nearest-neighbor classifiers were trained to assign scores to manually-graded essays. These scores were combined with several other summary text measures using linear regression. The classifiers and regression equations were then applied to a new set of essays. 
The classifiers worked very well. The agreement between the automated grader and the final manual grade was as good as the agreement between human graders.}, } @inProceedings{Larkey99, author = {Leah S. Larkey}, title = {A patent search and classification system}, booktitle = {Proceedings of DL-99, 4th ACM Conference on Digital Libraries}, editor = {Edward A. Fox and Neil Rowe}, publisher = {ACM Press, New York, US}, year = {1999}, address = {Berkeley, US}, pages = {179--187}, url = {http://cobar.cs.umass.edu/pubfiles/ir-162.ps}, abstract = {We present a system for searching and classifying U.S. patent documents, based on Inquery. Patents are distributed through hundreds of collections, divided up by general area. The system selects the best collections for the query. Users can search for patents or classify patent text. The user interface helps users search in fields without requiring the knowledge of Inquery query operators. The system includes a unique phrase help facility, which helps users find and add phrases and terms related to those in their query.}, } @inProceedings{Lee00, author = {Hahn-Ming Lee and Chih-Ming Chen and Cheng-Wei Hwang}, title = {A neural network document classifier with linguistic feature selection}, booktitle = {Proceedings of IEA/AIE-00, 13th International Conference on Industrial and Engineering Applications of Artificial Intelligence and Expert Systems}, publisher = {}, editor = {}, year = {2000}, address = {New Orleans, US}, pages = {555--560}, url = {}, abstract = {}, } @inProceedings{Lee02, author = {Yong-Bae Lee and Sung H. 
Myaeng}, title = {Text Genre Classification with Genre-Revealing and Subject-Revealing Features}, booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on Research and Development in Information Retrieval}, editor = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng and Kalervo J{\"{a}}rvelin}, publisher = {ACM Press, New York, US}, address = {Tampere, FI}, year = {2002}, pages = {145--150}, url = {http://doi.acm.org/10.1145/564376.564403}, abstract = {Subject or propositional content has been the focus of most classification research. Genre or style, on the other hand, is a different and important property of text, and automatic text genre classification is becoming important for classification and retrieval purposes as well as for some natural language processing research. In this paper, we present a method for automatic genre classification that is based on statistically selected features obtained from both subject-classified and genre-classified training data. The experimental results show that the proposed method outperforms a direct application of a statistical learner often used for subject classification. We also observe that the deviation formula and discrimination formula using document frequency ratios also work as expected. We conjecture that this dual feature set approach can be generalized to improve the performance of subject classification as well.}, } @inProceedings{Lee02a, author = {Michael D. Lee}, title = {Fast Text Classification Using Sequential Sampling Processes}, booktitle = {Proceedings of the 14th Australian Joint Conference on Artificial Intelligence}, editor = {Markus Stumptner and Dan Corbett and Michael J. 
Brooks}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Adelaide, AU}, year = {2002}, pages = {309--320}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2256}, url = {http://link.springer.de/link/service/series/0558/papers/2256/22560309.pdf}, abstract = {A central problem in information retrieval is the automated classification of text documents. While many existing methods achieve good levels of performance, they generally require levels of computation that prevent them from making sufficiently fast decisions in some applied setting. Using insights gained from examining the way humans make fast decisions when classifying text documents, two new text classification algorithms are developed based on sequential sampling processes. These algorithms make extremely fast decisions, because they need to examine only a small number of words in each text document. Evaluation against the Reuters-21578 collection shows both techniques have levels of performance that approach benchmark methods, and the ability of one of the classifiers to produce realistic measures of confidence in its decisions is shown to be useful for prioritizing relevant documents.}, } @inProceedings{Lee02c, author = {Kang Hyuk Lee and Judy Kay and Byeong Ho Kang and Uwe Rosebrock}, title = {A Comparative Study on Statistical Machine Learning Algorithms and Thresholding Strategies for Automatic Text Categorization}, booktitle = {Proceedings of PRICAI-02, 7th Pacific Rim International Conference on Artificial Intelligence}, editor = {Mitsuru Ishizuka and Abdul Sattar}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Tokyo, JP}, year = {2002}, pages = {444--453}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2417}, url = {http://link.springer.de/link/service/series/0558/papers/2417/24170444.pdf}, abstract = {Two main research areas in statistical text categorization are similarity-based learning algorithms and associated 
thresholding strategies. The combination of these techniques significantly influences the overall performance of text categorization. After investigating two similarity-based classifiers (k-NN and Rocchio) and three common thresholding techniques (RCut, PCut, and SCut), we describe a new learning algorithm known as the keyword association network (KAN) and a new thresholding strategy (RinSCut) to improve performance over existing techniques. Extensive experiments have been conducted on the Reuters-21578 and 20-Newsgroups data sets. The experimental results show that our new approaches give better results for both micro-averaged F1 and macro-averaged F1 scores.}, } @article{Lehnert94, author = {Wendy Lehnert and Stephen Soderland and David Aronow and Fangfang Feng and Avinoam Shmueli}, title = {Inductive text classification for medical applications}, journal = {Journal of Experimental and Theoretical Artificial Intelligence}, year = {1994}, number = {1}, volume = {7}, pages = {49--80}, url = {}, abstract = {}, } @article{Leopold02, author = {Leopold, Edda and Kindermann, J{\"{o}}rg}, title = {Text Categorization with Support Vector Machines: How to Represent Texts in Input Space?}, journal = {Machine Learning}, year = {2002}, volume = {46}, number = {1/3}, pages = {423--444}, url = {http://www.wkap.nl/article.pdf?380516}, abstract = {The choice of the kernel function is crucial to most applications of support vector machines. In this paper, however, we show that in the case of text classification, term-frequency transformations have a larger impact on the performance of SVM than the kernel itself. We discuss the role of importance-weights (e.g. 
document frequency and redundancy), which is not yet fully understood in the light of model complexity and calculation cost, and we show that time consuming lemmatization or stemming can be avoided even when classifying a highly inflectional language like German.}, } @article{Lertnattee04, author = {Verayuth Lertnattee and Thanaruk Theeramunkong}, title = {Effect of term distributions on centroid-based text categorization}, journal = {Information Sciences}, year = {2004}, number = {1}, volume = {158}, pages = {89--115}, url = {http://dx.doi.org/10.1016/j.ins.2003.07.007}, abstract = {Most of traditional text categorization approaches utilize term frequency (tf) and inverse document frequency (idf) for representing importance of words and/or terms in classifying a text document. This paper describes an approach to apply term distributions, in addition to tf and idf, to improve performance of centroid-based text categorization. Three types of term distributions, called inter-class, intra-class and in-collection distributions, are introduced. These distributions are useful to increase classification accuracy by exploiting information of (1) term distribution among classes, (2) term distribution within a class and (3) term distribution in the whole collection of training data. In addition, this paper investigates how these term distributions contribute to weight each term in documents, e.g., a high term distribution of a word promotes or demotes importance or classification power of that word. To this end, several centroid-based classifiers are constructed with different term weightings. Using various data sets, their performances are investigated and compared to a standard centroid-based classifier (TFIDF) and a centroid-based classifier modified with information gain. Moreover, we also compare them to two well-known methods: k-NN and naive Bayes. In addition to a unigram model of document representation, a bigram model is also explored. 
Finally, the effectiveness of term distributions to improve classification accuracy is explored with regard to the training set size and the number of classes.}, } @article{Leung97, author = {Chi-Hong Leung and Wing-Kay Kan}, title = {A Statistical Learning Approach to Automatic Indexing of Controlled Index Terms}, journal = {Journal of the American Society for Information Science}, year = {1997}, number = {1}, pages = {55--67}, volume = {48}, url = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=39602&PLACEBO=IE.pdf}, abstract = {A statistical learning approach to assigning controlled index terms is presented. In this approach, there are two processes: (1) The learning process and (2) the indexing process. The learning process constructs a relationship between an index term and the words relevant and irrelevant to it, based on the positive training set and negative training set, which are sample documents indexed by the index term, and those not indexed by it, respectively. The indexing process determines whether an index term is assigned to a certain document, based on the relationship constructed by the learning process, and the text found in the document. Furthermore, a learning feedback technique is introduced. This technique used in the learning process modifies the relationship between an index term and its relevant and irrelevant words to improve the learning performance and, thus, the indexing performance. 
Experimental results have shown that the statistical learning approach and the learning feedback technique are practical means to automatic indexing of controlled index terms.}, } @inProceedings{Lewis00, author = {Lewis, David D.}, title = {Machine learning for text categorization: background and characteristics}, booktitle = {Proceedings of the 21st Annual National Online Meeting}, editor = {Williams, Martha E.}, publisher = {Information Today, Medford, USA}, address = {New York, US}, year = {2000}, pages = {221--226}, url = {}, abstract = {Text categorization is of increasing interest in both controlled vocabulary indexing and other applications. Machine learning methods for automatically producing categorization rules have similarly seen increased attention, as a way to reduce the cost of fielding categorization systems. While the experimental literature on text categorization emphasizes effectiveness comparisons, we list a variety of other characteristics of learning approaches that are equally important to consider. Research on machine learning for text categorization, already advancing at a rapid pace, could be further accelerated if better test collections were available.}, } @article{Lewis04, author = {Lewis, David D. and Fan Li and Tony Rose and Yiming Yang}, title = {{Reuters Corpus Volume 1} as a text categorization test collection}, journal = {Journal of Machine Learning Research}, volume = {5}, month = {April}, pages = {361--397}, year = {2004}, url = {http://www.jmlr.org/papers/volume5/lewis04a/lewis04a.pdf}, abstract = {Reuters Corpus Volume I (RCV1) is an archive of over 800,000 manually categorized newswire stories recently made available by Reuters, Ltd. for research purposes. Use of this data for research on text categorization requires a detailed understanding of the real world constraints under which the data was produced. 
Drawing on interviews with Reuters personnel and access to Reuters documentation, we describe the coding policy and quality control procedures used in producing the RCV1 data, the intended semantics of the hierarchical category taxonomies, and the corrections necessary to remove errorful data. We refer to the original data as RCV1-v1, and the corrected data as RCV1-v2. We benchmark several widely used supervised learning methods on RCV1-v2, illustrating the collection's properties, suggesting new directions for research, and providing baseline results for future studies. We make available detailed, per-category experimental results, as well as corrected versions of the category assignments and taxonomy structures, via online appendices.}, } @inProceedings{Lewis91, author = {Lewis, David D.}, title = {Data extraction as text categorization: An experiment with the {MUC-3} corpus}, booktitle = {Proceedings of MUC-3, 3rd Message Understanding Conference}, editor = {}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, address = {San Diego, US}, pages = {245--255}, year = {1991}, url = {http://www.research.att.com/~lewis/papers/lewis91c.ps}, abstract = {[no abstract]}, } @inProceedings{Lewis92, author = {Lewis, David D.}, title = {An evaluation of phrasal and clustered representations on a text categorization task}, booktitle = {Proceedings of SIGIR-92, 15th ACM International Conference on Research and Development in Information Retrieval}, editor = {Nicholas J. Belkin and Peter Ingwersen and Annelise Mark Pejtersen}, publisher = {ACM Press, New York, US}, address = {K{\o}benhavn, DK}, pages = {37--50}, year = {1992}, url = {http://www.research.att.com/~lewis/papers/lewis92b.ps}, abstract = {Syntactic phrase indexing and term clustering have been widely explored as text representation techniques for text retrieval. 
In this paper, we study the properties of phrasal and clustered indexing languages on a text categorization task, enabling us to study their properties in isolation from query interpretation issues. We show that optimal effectiveness occurs when using only a small proportion of the indexing terms available, and that effectiveness peaks at a higher feature set size and lower effectiveness level for a syntactic phrase indexing than for word-based indexing. We also present results suggesting that traditional term clustering methods are unlikely to provide significantly improved text representations. An improved probabilistic text categorization method is also presented.}, } @phdThesis{Lewis92a, author = {Lewis, David D.}, title = {Representation and learning in information retrieval}, school = {Department of Computer Science, University of Massachusetts}, address = {Amherst, US}, year = {1992}, url = {http://www.research.att.com/~lewis/papers/lewis91d.ps}, abstract = {This dissertation introduces a new theoretical model for text classification systems, including systems for document retrieval, automated indexing, electronic mail filtering, and similar tasks. The Concept Learning model emphasizes the role of manual and automated feature selection and classifier formation in text classification. It enables drawing on results from statistics and machine learning in explaining the effectiveness of alternate representations of text, and specifies desirable characteristics of text representations. The use of syntactic parsing to produce indexing phrases has been widely investigated as a possible route to better text representations. Experiments with syntactic phrase indexing, however, have never yielded significant improvements in text retrieval performance. The Concept Learning model suggests that the poor statistical characteristics of a syntactic indexing phrase representation negate its desirable semantic characteristics. 
The application of term clustering to this representation to improve its statistical properties while retaining its desirable meaning properties is proposed. Standard term clustering strategies from information retrieval (IR), based on cooccurrence of indexing terms in documents or groups of documents, were tested on a syntactic indexing phrase representation. In experiments using a standard text retrieval test collection, small effectiveness improvements were obtained. As a means of evaluating representation quality, a text retrieval test collection introduces a number of confounding factors. In contrast, the text categorization task allows much cleaner determination of text representation properties. In preparation for the use of text categorization to study text representation, a more effective and theoretically well-founded probabilistic text categorization algorithm was developed, building on work by Maron, Fuhr, and others. Text categorization experiments supported a number of predictions of the Concept Learning model about properties of phrasal representations, including dimensionality properties not previously measured for text representations. However, in carefully controlled experiments using syntactic phrases produced by Church's stochastic bracketer, in conjunction with reciprocal nearest neighbor clustering, term clustering was found to produce essentially no improvement in the properties of the phrasal representation. New cluster analysis approaches are proposed to remedy the problems found in traditional term clustering methods.}, } @inProceedings{Lewis94, author = {Lewis, David D. 
and Marc Ringuette}, title = {A comparison of two learning algorithms for text categorization}, booktitle = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document Analysis and Information Retrieval}, publisher = {}, editor = {}, year = {1994}, address = {Las Vegas, US}, pages = {81--93}, url = {http://www.research.att.com/~lewis/papers/lewis94b.ps}, abstract = {This paper examines the use of inductive learning to categorize natural language documents into predefined content categories. Categorization of text is of increasing importance in information retrieval and natural language processing systems. Previous research on automated text categorization has mixed machine learning and knowledge engineering methods, making it difficult to draw conclusions about the performance of particular methods. In this paper we present empirical results on the performance of a Bayesian classifier and a decision tree learning algorithm on two text categorization data sets. We find that both algorithms achieve reasonable performance and allow controlled tradeoffs between false positives and false negatives. The stepwise feature selection in the decision tree algorithm is particularly effective in dealing with the large feature sets common in text categorization. However, even this algorithm is aided by an initial prefiltering of features, confirming the results found by Almuallim and Dietterich on artificial data sets. We also demonstrate the impact of the time-varying nature of category definitions.}, } @inProceedings{Lewis94a, author = {Lewis, David D. and Gale, William A.}, title = {A sequential algorithm for training text classifiers}, booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on Research and Development in Information Retrieval}, editor = {W. 
Bruce Croft and Van Rijsbergen, Cornelis J.}, publisher = {Springer Verlag, Heidelberg, DE}, year = {1994}, address = {Dublin, IE}, pages = {3--12}, note = {See also~\cite{Lewis95a}}, url = {http://www.research.att.com/~lewis/papers/lewis94c.ps}, abstract = {The ability to cheaply train text classifiers is critical to their use in information retrieval, content analysis, natural language processing, and other tasks involving data which is partly or fully textual. An algorithm for sequential sampling during machine learning of statistical classifiers was developed and tested on a newswire text categorization task. This method, which we call uncertainty sampling, reduced by as much as 500-fold the amount of training data that would have to be manually classified to achieve a given level of effectiveness.}, } @article{Lewis94b, author = {Lewis, David D. and Philip J. Hayes}, title = {Guest editors' introduction to the special issue on text categorization}, journal = {ACM Transactions on Information Systems}, volume = {12}, number = {3}, pages = {231}, year = {1994}, } @inProceedings{Lewis94c, author = {Lewis, David D. and Jason Catlett}, title = {Heterogeneous uncertainty sampling for supervised learning}, booktitle = {Proceedings of ICML-94, 11th International Conference on Machine Learning}, editor = {William W. Cohen and Haym Hirsh}, year = {1994}, address = {New Brunswick, US}, pages = {148--156}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www.research.att.com/~lewis/papers/lewis94e.ps}, abstract = {Uncertainty sampling methods iteratively request class labels for training instances whose classes are uncertain despite the previous labeled instances. These methods can greatly reduce the number of instances that an expert need label. One problem with this approach is that the classifier best suited for an application may be too expensive to train or use during the selection of instances. 
We test the use of one classifier (a highly efficient probabilistic one) to select examples for training another (the C4.5 rule induction program). Despite being chosen by this heterogeneous approach, the uncertainty samples yielded classifiers with lower error rates than random samples ten times larger.}, } @inProceedings{Lewis95, author = {Lewis, David D.}, title = {Evaluating and optimizing autonomous text classification systems}, booktitle = {Proceedings of SIGIR-95, 18th ACM International Conference on Research and Development in Information Retrieval}, editor = {Edward A. Fox and Peter Ingwersen and Raya Fidel}, publisher = {ACM Press, New York, US}, year = {1995}, address = {Seattle, US}, pages = {246--254}, url = {http://www.research.att.com/~lewis/papers/lewis95b.ps}, abstract = {Text retrieval systems typically produce a ranking of documents and let a user decide how far down that ranking to go. In contrast, programs that filter text streams, software that categorizes documents, agents which alert users, and many other IR systems must make decisions without human input or supervision. It is important to define what constitutes good effectiveness for these autonomous systems, tune the systems to achieve the highest possible effectiveness, and estimate how the effectiveness changes as new data is processed. We show how to do this for binary text classification systems, emphasizing that different goals for the system lead to different optimal behaviors. 
Optimizing and estimating effectiveness is greatly aided if classifiers that explicitly estimate the probability of class membership are used.}, } @article{Lewis95a, author = {Lewis, David D.}, title = {A sequential algorithm for training text classifiers: corrigendum and additional data}, journal = {SIGIR Forum}, year = {1995}, pages = {13--19}, volume = {29}, number = {2}, url = {http://www.research.att.com/~lewis/papers/lewis95g.ps}, abstract = {Previously I compared the effectiveness of uncertainty sampling with that of random sampling and relevance sampling in choosing training data for a text categorization data set (Lewis and Gale, 1994). (Relevance sampling is the application of relevance feedback to producing a training sample.) I have discovered a bug in my experimental software which caused the relevance sampling results reported in the paper to be incorrect. (The uncertainty sampling and random sampling results in that paper were correct.) I have since fixed the bug and rerun the experiments. This note presents the corrected results, along with additional data supporting the original claim that uncertainty sampling has an advantage over relevance sampling in most training situations.}, } @inProceedings{Lewis95b, author = {David D. Lewis}, title = {The {TREC-4} filtering track: description and analysis}, booktitle = {Proceedings of TREC-4, 4th Text Retrieval Conference}, publisher = {National Institute of Standards and Technology, Gaithersburg, US}, editor = {Donna K. Harman and Ellen M. Voorhees}, year = {1995}, address = {Gaithersburg, US}, pages = {165--180}, url = {http://www.research.att.com/~lewis/papers/lewis96b.ps}, abstract = {The TREC-4 (4th Text REtrieval Conference) filtering track was an experiment in the evaluation of binary text classification systems. In contrast to ranking systems, binary text classification systems may need to produce result sets of any size, requiring that sampling be used to estimate their effectiveness. 
We present an effectiveness measure based on utility, and two sampling strategies (pooling and stratified sampling) for estimating the utility of the submitted sets. An evaluation of four sites was successfully carried out using this approach.}, } @inProceedings{Lewis96, author = {Lewis, David D. and Robert E. Schapire and James P. Callan and Ron Papka}, title = {Training algorithms for linear text classifiers}, booktitle = {Proceedings of SIGIR-96, 19th ACM International Conference on Research and Development in Information Retrieval}, editor = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and Ross Wilkinson}, publisher = {ACM Press, New York, US}, year = {1996}, address = {Z{\"{u}}rich, CH}, pages = {298--306}, url = {http://www.research.att.com/~lewis/papers/lewis96d.ps}, abstract = {Systems for text retrieval, routing, categorization and other IR tasks rely heavily on linear classifiers. We propose that two machine learning algorithms, the Widrow-Hoff and EG algorithms, be used in training linear text classifiers. In contrast to most IR methods, theoretical analysis provides performance guarantees and guidance on parameter settings for these algorithms. Experimental data is presented showing Widrow-Hoff and EG to be more effective than the widely used Rocchio algorithm on several categorization and routing tasks.}, } @misc{Lewis97a, author = {Lewis, David D.}, title = {Reuters-21578 text categorization test collection. 
Distribution 1.0}, year = {1997}, note = {Available as {\tt http://www.daviddlewis.com/resources/testcollections/reuters21578/readme.txt}}, url = {http://www.daviddlewis.com/resources/testcollections/reuters21578/readme.txt}, abstract = {[no abstract]}, } @inProceedings{Lewis98, author = {Lewis, David D.}, title = {Naive ({Bayes}) at forty: The independence assumption in information retrieval}, booktitle = {Proceedings of ECML-98, 10th European Conference on Machine Learning}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1398}, editor = {Claire N{\'{e}}dellec and C{\'{e}}line Rouveirol}, address = {Chemnitz, DE}, pages = {4--15}, year = {1998}, url = {http://www.research.att.com/~lewis/papers/lewis98b.ps}, abstract = {The naive Bayes classifier, currently experiencing a renaissance in machine learning, has long been a core technique in information retrieval. We review some of the variations of naive Bayes models used for text retrieval and classification, focusing on the distributional assumptions made about word occurrences in documents.}, } @inProceedings{Lewis99, author = {Lewis, David D. and Daniel L. Stern and Amit Singhal}, title = {{\sc Attics}: a software platform for on-line text classification}, booktitle = {Proceedings of SIGIR-99, 22nd ACM International Conference on Research and Development in Information Retrieval}, editor = {Marti A. Hearst and Fredric Gey and Richard Tong}, publisher = {ACM Press, New York, US}, address = {Berkeley, US}, year = {1999}, pages = {267--268}, url = {http://www.acm.org/pubs/articles/proceedings/ir/312624/p267-lewis/p267-lewis.pdf}, abstract = {Numerous systems for ranked retrieval on text databases have been implemented by both information retrieval researchers and in the commercial sector. In contrast, software for text categorization, message filtering, textual data mining, and related tasks is less common. 
ATTICS is an extensible text classification system we have implemented in C++. It supports incremental training and online application of classifiers and predictive models to streams of textual, numeric, symbolic, and hybrid data records. An object-oriented design allows easy addition of new preprocessors, machine learning algorithms, and classifier types.}, } @article{Li02, author = {Hang Li and Kenji Yamanishi}, title = {Text classification using ESC-based stochastic decision lists}, journal = {Information Processing and Management}, pages = {343--361}, year = {2002}, number = {3}, volume = {38}, url = {}, abstract = {We propose a new method of text classification using stochastic decision lists. A stochastic decision list is an ordered sequence of IF-THEN-ELSE rules, and our method can be viewed as a rule-based method for text classification having advantages of readability and refinability of acquired knowledge. Our method is unique in that decision lists are automatically constructed on the basis of the principle of minimizing extended stochastic complexity (ESC), and with it we are able to construct decision lists that have fewer errors in classification. The accuracy of classification achieved with our method appears better than or comparable to those of existing rule-based methods. We have empirically demonstrated that rule-based methods like ours result in high classification accuracy when the categories to which texts are to be assigned are relatively specific ones and when the texts tend to be short. 
We have also empirically verified the advantages of rule-based methods over non-rule-based ones.}, } @inProceedings{Li02a, author = {Xin Li and Dan Roth}, title = {Learning question classifiers}, booktitle = {Proceedings of COLING-02, 19th International Conference on Computational Linguistics}, editor = {}, publisher = {}, address = {Taipei, TW}, url = {http://l2r.cs.uiuc.edu/~danr/Papers/qc-coling02.pdf}, year = {2002}, abstract = {In order to respond correctly to a free form factual question given a large collection of texts, one needs to understand the question to a level that allows determining some of the constraints the question imposes on a possible answer. These constraints may include a semantic classification of the sought after answer and may even suggest using different strategies when looking for and verifying a candidate answer. This paper presents a machine learning approach to question classification. We learn a hierarchical classifier that is guided by a layered semantic hierarchy of answer types, and eventually classifies questions into finegrained classes. 
We show accurate results on a large collection of free-form questions used in TREC 10.},
}

@inProceedings{Li03,
  author = {Cong Li and Ji-Rong Wen and Hang Li},
  title = {Text Classification Using Stochastic Keyword Generation},
  booktitle = {Proceedings of ICML-03, 20th International Conference on Machine Learning},
  editor = {},
  year = {2003},
  address = {Washington, US},
  pages = {},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url = {},
  abstract = {},
}

@inProceedings{Li03a,
  author = {Fan Li and Yiming Yang},
  title = {A Loss Function Analysis for Classification Methods in Text Categorization},
  booktitle = {Proceedings of ICML-03, 20th International Conference on Machine Learning},
  editor = {},
  year = {2003},
  address = {Washington, US},
  pages = {},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url = {},
  abstract = {},
}

@inProceedings{Li03b,
  author = {Tao Li and Shenghuo Zhu and Mitsunori Ogihara},
  title = {Efficient multi-way text categorization via generalized discriminant analysis},
  booktitle = {Proceedings of CIKM-03, 12th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor = {},
  year = {2003},
  address = {New Orleans, US},
  pages = {317--324},
  url = {http://doi.acm.org/10.1145/956863.956924},
  abstract = {Text categorization is an important research area and has been receiving much attention due to the growth of the on-line information and of Internet. Automated text categorization is generally cast as a multi-class classification problem. Much of previous work focused on binary document classification problems. Support vector machines (SVMs) excel in binary classification, but the elegant theory behind large-margin hyperplane cannot be easily extended to multi-class text classification. In addition, the training time and scaling are also important concerns.
On the other hand, other techniques naturally extensible to handle multi-class classification are generally not as accurate as SVM. This paper presents a simple and efficient solution to multi-class text categorization. Classification problems are first formulated as optimization via discriminant analysis. Text categorization is then cast as the problem of finding coordinate transformations that reflects the inherent similarity from the data. While most of the previous approaches decompose a multi-class classification problem into multiple independent binary classification tasks, the proposed approach enables direct multi-class classification. By using Generalized Singular Value Decomposition (GSVD), a coordinate transformation that reflects the inherent class structure indicated by the generalized singular values is identified. Extensive experiments demonstrate the efficiency and effectiveness of the proposed approach.}, } @inProceedings{Li91, author = {Wei Li and B. Lee and F. Krausz and K. Sahin}, title = {Text classification by a neural network}, booktitle = {Proceedings of the 23rd Annual Summer Computer Simulation Conference}, editor = {}, publisher = {}, address = {Baltimore, US}, pages = {313--318}, year = {1991}, url = {}, abstract = {When banks process their free-form telex traffic, the first task is the classification of the telexes. Historically, several attempts have been made to automate this process, using various stock phrases as the features on which to base the classification. This is a problem in which there are large amounts of data available, but the rules for classification are not explicitly available. For solving these kinds of problems, neural networks have the advantage of extracting the underlying relationships between the input data and the output classes automatically. 
Based on this consideration, the authors have built a neural network classification system, which has three subsystems: a user-maintainable feature definition subsystem, a feature extraction subsystem, and a neural network subsystem. The neural network is simulated on a VAX computer with a fast learning algorithm, and is combined with some non-statistical knowledge from the feature definition system. Above 90\% correct recognition rates have been achieved for the major categories concerned. The system is also applicable to text classification problems other than telex classification.},
}

@inProceedings{Li97,
  author = {Hang Li and Kenji Yamanishi},
  title = {Document classification using a finite mixture model},
  booktitle = {Proceedings of ACL-97, 35th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  editor = {Philip R. Cohen and Wolfgang Wahlster},
  year = {1997},
  address = {Madrid, ES},
  pages = {39--47},
  url = {http://xxx.lanl.gov/ps/cmp-lg/9705005},
  abstract = {We propose a new method of classifying documents into categories. The simple method of conducting hypothesis testing over word-based distributions in categories suffers from the data sparseness problem. In order to address this difficulty, Guthrie et al. have developed a method using distributions based on hard clustering of words, i.e., in which a word is assigned to a single cluster and words in the same cluster are treated uniformly. This method might, however, degrade classification results, since the distributions it employs are not always precise enough for representing the differences between categories. We propose here the use of soft clustering of words, i.e., in which a word can be assigned to several different clusters and each cluster is characterized by a specific word probability distribution. We define for each document category a finite mixture model, which is a linear combination of the probability distributions of the clusters. We thereby treat the problem of classifying documents as that of conducting statistical hypothesis testing over finite mixture models. In order to accomplish this testing, we employ the EM algorithm which helps efficiently estimate parameters in a finite mixture model. Experimental results indicate that our method outperforms not only the method using distributions based on hard clustering, but also the method using word-based distributions and the method based on cosine-similarity.},
}

@article{Li98a,
  author = {Li, Yong H. and Jain, Anil K.},
  title = {Classification of text documents},
  journal = {The Computer Journal},
  year = {1998},
  volume = {41},
  number = {8},
  pages = {537--546},
  url = {},
  abstract = {The exponential growth of the Internet has led to a great deal of interest in developing useful and efficient tools and software to assist users in searching the Web. Document retrieval, categorization, routing and filtering can all be formulated as classification problems. However, the complexity of natural languages and the extremely high dimensionality of the feature space of documents have made this classification problem very difficult. We investigate four different methods for document classification: the naive Bayes classifier, the nearest neighbour classifier, decision trees and a subspace method. These were applied to seven-class Yahoo news groups (business, entertainment, health, international, politics, sports and technology) individually and in combination. We studied three classifier combination approaches: simple voting, dynamic classifier selection and adaptive classifier combination. Our experimental results indicate that the naive Bayes classifier and the subspace method outperform the other two classifiers on our data sets. Combinations of multiple classifiers did not always improve the classification accuracy compared to the best individual classifier. Among the three different combination approaches, our adaptive classifier combination method introduced here performed the best.},
}

@inProceedings{Li99,
  author = {Hang Li and Kenji Yamanishi},
  title = {Text classification using ESC-based stochastic decision lists},
  booktitle = {Proceedings of CIKM-99, 8th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor = {},
  year = {1999},
  address = {Kansas City, US},
  pages = {122--130},
  url = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p122-li/p122-li.pdf},
  abstract = {We propose a new method of text classification using stochastic decision lists. A stochastic decision list is an ordered sequence of IF-THEN rules, and our method can be viewed as a rule-based method for text classification having advantages of readability and refinability of acquired knowledge. Our method is unique in that decision lists are automatically constructed on the basis of the principle of minimizing Extended Stochastic Complexity (ESC), and with it we are able to construct decision lists that have fewer errors in classification. The accuracy of classification achieved with our method appears better than or comparable to those of existing rule-based methods.},
}

@inProceedings{Liao02,
  author = {Yihua Liao and V. Rao Vemuri},
  title = {Using Text Categorization Techniques for Intrusion Detection},
  booktitle = {Proceedings of the 11th USENIX Security Symposium},
  publisher = {},
  editor = {Dan Boneh},
  year = {2002},
  address = {San Francisco, US},
  pages = {51--59},
  url = {http://www.usenix.org/publications/library/proceedings/sec02/liao.html},
  abstract = {A new approach, based on the k-Nearest Neighbor (kNN) classifier, is used to classify program behavior as normal or intrusive.
Short sequences of system calls have been used by others to characterize a program's normal behavior before. However, separate databases of short system call sequences have to be built for different programs, and learning program profiles involves time-consuming training and testing processes. With the kNN classifier, the frequencies of system calls are used to describe the program behavior. Text categorization techniques are adopted to convert each process to a vector and calculate the similarity between two program activities. Since there is no need to learn individual program profiles separately, the calculation involved is largely reduced. Preliminary experiments with 1998 DARPA BSM audit data show that the kNN classifier can effectively detect intrusive attacks and achieve a low false positive rate.}, } @article{Liddy94, author = {Elizabeth D. Liddy and Woojin Paik and Edmund S. Yu}, title = {Text categorization for multiple users based on semantic features from a machine-readable dictionary}, journal = {ACM Transactions on Information Systems}, year = {1994}, number = {3}, volume = {12}, pages = {278--295}, url = {http://www.acm.org/pubs/articles/journals/tois/1994-12-3/p278-liddy/p278-liddy.pdf}, abstract = {The text categorization module described in the paper provides a front-end filtering function for the larger DR-LINK text retrieval system (Liddy and Myaeng 1993). The module evaluates a large incoming stream of documents to determine which documents are sufficiently similar to a profile at the broad subject level to warrant more refined representation and matching. To accomplish this task, each substantive word in a text is first categorized using a feature set based on the semantic subject field codes (SFCs) assigned to individual word senses in a machine-readable dictionary. 
When tested on 50 user profiles and 550 megabytes of documents, results indicate that the feature set that is the basis of the text categorization module and the algorithm that establishes the boundary of categories of potentially relevant documents accomplish their tasks with a high level of performance. This means that the category of potentially relevant documents for most profiles would contain at least 80\% of all documents later determined to be relevant to the profile. The number of documents in this set would be uniquely determined by the system's category-boundary predictor, and this set is likely to contain less than 5\% of the incoming stream of documents.}, } @inProceedings{Liere97, author = {Ray Liere and Prasad Tadepalli}, title = {Active learning with committees for text categorization}, booktitle = {Proceedings of AAAI-97, 14th Conference of the American Association for Artificial Intelligence}, editor = {}, publisher = {AAAI Press, Menlo Park, US}, year = {1997}, pages = {591--596}, address = {Providence, US}, url = {http://www.rdrop.com/~lierer/aaai97.ps}, abstract = {In many real-world domains, supervised learning requires a large number of training examples. In this paper, we describe an active learning method that uses a committee of learners to reduce the number of training examples required for learning. Our approach is similar to the Query by Committee framework, where disagreement among the committee members on the predicted label for the input part of the example is used to signal the need for knowing the actual value of the label. Our experiments are conducted in the text categorization domain, which is characterized by a large number of features, many of which are irrelevant. 
We report here on experiments using a committee of Winnow-based learners and demonstrate that this approach can reduce the number of labeled training examples required over that used by a single Winnow learner by 1-2 orders of magnitude.}, } @inProceedings{Liere98, author = {Ray Liere and Prasad Tadepalli}, title = {Active Learning with Committees: Preliminary Results in Comparing Winnow and Perceptron in Text Categorization}, booktitle = {Proceedings of CONALD-98, 1st Conference on Automated Learning and Discovery}, editor = {}, publisher = {AAAI Press, Menlo Park, US}, year = {1998}, pages = {}, address = {Pittsburgh, US}, url = {http://www.rdrop.com/~lierer/conald98.ps}, abstract = {The availability of vast amounts of information on the World Wide Web has created a big demand for automatic tools to organize and index that information. Unfortunately, the paradigm of supervised machine learning is ill-suited to this task, as it assumes that the training examples are classified by a teacher - usually a human. In this paper, we describe an active learning method based on Query by Committee (QBC) that reduces the number of labeled training examples (text documents) required for learning by 1-2 orders of magnitude.}, } @inProceedings{Lim99, author = {Lim, Joo Hwee}, title = {Learnable visual keywords for image classification}, booktitle = {Proceedings of DL-99, 4th ACM Conference on Digital Libraries}, editor = {Edward A. Fox and Neil Rowe}, publisher = {ACM Press, New York, US}, year = {1999}, address = {Berkeley, US}, pages = {139--145}, url = {http://www.acm.org/pubs/articles/proceedings/dl/313238/p139-lim/p139-lim.pdf}, abstract = {Automatic categorization of multimedia documents is an important function for a digital library system. While text categorization has received much attentions by IR researchers, classification of visual data is at its infancy stage. In this paper, we propose a notion of visual keywords for similarity matching between visual contents. 
Visual keywords can be constructed automatically from samples of visual data through supervised/unsupervised learning. Given a visual content, the occurrences of visual keywords are detected, summarized spatially, and coded via singular value decomposition to arrive at a concise coded description. The methods to create, detect, summarize, select, and code visual keywords will be detailed. Last but not least, we describe an evaluation experiment that classifies professional nature scenery photographs to demonstrate the effectiveness and efficiency of visual keywords for automatic categorization of images in digital libraries.}, } @article{Liu01, author = {Zhi-Qiang Liu and Ya-Jun Zhang}, title = {A competitive neural network approach to web-page categorization}, journal = {International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems}, volume = {9}, number = {6}, pages = {731--741}, year = {2001}, } @inProceedings{Liu02, author = {Yan Liu and Yiming Yang and Jaime Carbonell}, title = {Boosting to Correct the Inductive Bias for Text Classification}, booktitle = {Proceedings of CIKM-02, 11th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, editor = {}, year = {2002}, address = {McLean, US}, pages = {348--355}, url = {http://doi.acm.org/10.1145/584792.584850}, abstract = {This paper studies the effects of boosting in the context of different classification methods for text categorization, including Decision Trees, Naive Bayes, Support Vector Machines (SVMs) and a Rocchio-style classifier. We identify the inductive biases of each classifier and explore how boosting, as an error-driven resampling mechanism, reacts to those biases. Our experiments on the Reuters-21578 benchmark show that boosting is not effective in improving the performance of the base classifiers on common categories. 
However, the effect of boosting for rare categories varies across classifiers: for SVMs and Decision Trees, we achieved a 13-17\% performance improvement in macro-averaged F1 measure, but did not obtain substantial improvement for the other two classifiers. This interesting finding of boosting on rare categories has not been reported before.},
}

@inProceedings{Liu03,
  author = {Yan Liu and Jaime Carbonell and Rong Jin},
  title = {A New Pairwise Ensemble Approach for Text Classification},
  booktitle = {Proceedings of ECML-03, 14th European Conference on Machine Learning},
  publisher = {},
  editor = {},
  year = {2003},
  address = {Dubrovnik, HR},
  pages = {},
  url = {},
  abstract = {},
}

@article{Lodhi02,
  author = {Huma Lodhi and Craig Saunders and John Shawe-Taylor and Nello Cristianini and Chris Watkins},
  title = {Text Classification using String Kernels},
  journal = {Journal of Machine Learning Research},
  volume = {2},
  pages = {419--444},
  year = {2002},
  url = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/lodhi02a/lodhi02a.pdf},
  abstract = {We propose a novel approach for categorizing text documents based on the use of a special kernel. The kernel is an inner product in the feature space generated by all subsequences of length k. A subsequence is any ordered sequence of k characters occurring in the text though not necessarily contiguously. The subsequences are weighted by an exponentially decaying factor of their full length in the text, hence emphasising those occurrences that are close to contiguous. A direct computation of this feature vector would involve a prohibitive amount of computation even for modest values of k, since the dimension of the feature space grows exponentially with k. The paper describes how despite this fact the inner product can be efficiently evaluated by a dynamic programming technique.
Experimental comparisons of the performance of the kernel compared with a standard word feature space kernel (Joachims, 1998) show positive results on modestly sized datasets. The case of contiguous subsequences is also considered for comparison with the subsequences kernel with different decay factors. For larger documents and datasets the paper introduces an approximation technique that is shown to deliver good approximations efficiently for large datasets.},
}

@inProceedings{Macskassy01,
  author = {Sofus A. Macskassy and Haym Hirsh and Arunava Banerjee and Aynur A. Dayanik},
  title = {Using Text Classifiers for Numerical Classification},
  booktitle = {Proceedings of IJCAI-01, 17th International Joint Conference on Artificial Intelligence},
  editor = {Bernhard Nebel},
  address = {Seattle, US},
  year = {2001},
  pages = {885--890},
  url = {http://www.cs.rutgers.edu/~sofmac/paper/ijcai2001/macskassy-ijcai2001.pdf},
  abstract = {Consider a supervised learning problem in which examples contain both numerical- and text-valued features. To use traditional feature-vector-based learning methods, one could treat the presence or absence of a word as a Boolean feature and use these binary-valued features together with the numerical features. However, the use of a text-classification system on this is a bit more problematic - in the most straight-forward approach each number would be considered a distinct token and treated as a word. This paper presents an alternative approach for the use of text classification methods for supervised learning problems with numerical-valued features in which the numerical features are converted into bag-of-words features, thereby making them directly usable by text classification methods. We show that even on purely numerical-valued data the results of text-classification on the derived text-like representation outperforms the more naive numbers-as-tokens representation and, more importantly, is competitive with mature numerical classification methods such as C4.5 and Ripper.},
}

@article{Macskassy03,
  author = {Sofus A. Macskassy and Haym Hirsh and Arunava Banerjee and Aynur A. Dayanik},
  title = {Converting numerical classification into text classification},
  journal = {Artificial Intelligence},
  volume = {143},
  number = {1},
  year = {2003},
  pages = {51--77},
  url = {},
  abstract = {},
}

@inProceedings{Macskassy03a,
  author = {Sofus A. Macskassy and Haym Hirsh},
  title = {Adding numbers to text classification},
  booktitle = {Proceedings of CIKM-03, 12th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor = {},
  year = {2003},
  address = {New Orleans, US},
  pages = {240--246},
  url = {http://doi.acm.org/10.1145/956863.956910},
  abstract = {Many real-world problems involve a combination of both text- and numerical-valued features. For example, in email classification, it is possible to use instance representations that consider not only the text of each message, but also numerical-valued features such as the length of the message or the time of day at which it was sent. Text-classification methods have thus far not easily incorporated numerical features. In earlier work we described an approach for converting numerical features into bags of tokens so that text classification methods can be applied to numerical classification problems, and showed that the resulting learning methods are competitive with traditional numerical classification methods. In this paper we use this as a way to learn on problems that involve a combination of text and numbers. We show that the results outperform competing methods.
Further, we show that selecting a best classification method using text-only features and then adding numerical features to the problem (as might happen if numerical features are only later added to a pre-existing text-classification problem) gives performance that rivals a more time-consuming approach of evaluating all classification methods using the full set of both text and numerical features.},
}

@article{Maderlechner97,
  author = {Maderlechner, G. and Suda, P. and Bruckner, T.},
  title = {Classification of documents by form and content},
  journal = {Pattern Recognition Letters},
  pages = {1225--1231},
  year = {1997},
  volume = {18},
  number = {11/13},
  url = {},
  abstract = {This paper presents a modular software system, which classifies a large variety of office documents according to layout form and textual content. It consists of the following components: layout analysis, pre-classification, OCR interface, fuzzy string matching, text categorization, lexical, syntactical and semantic analysis. The system has been applied to the following tasks: presorting of forms, reports and letters, index extraction for archiving and retrieval, page type classification and text column analysis of real estate register documents, in-house mail sorting and electronic distribution to departments. The architecture, modules, and practical results are described.},
}

@article{Manevitz01,
  author = {Larry M. Manevitz and Malik Yousef},
  title = {One-Class {SVMs} for Document Classification},
  journal = {Journal of Machine Learning Research},
  volume = {2},
  month = dec,
  pages = {139--154},
  year = {2001},
  url = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/manevitz01a/manevitz01a.pdf},
  abstract = {We implemented versions of the SVM appropriate for one-class classification in the context of information retrieval. The experiments were conducted on the standard Reuters data set. For the SVM implementation we used both a version of Schoelkopf et al. and a somewhat different version of one-class SVM based on identifying ``outlier'' data as representative of the second-class. We report on experiments with different kernels for both of these implementations and with different representations of the data, including binary vectors, tf-idf representation and a modification called ``Hadamard'' representation. Then we compared it with one-class versions of the algorithms prototype (Rocchio), nearest neighbor, naive Bayes, and finally a natural one-class neural network classification method based on ``bottleneck'' compression generated filters. The SVM approach as represented by Schoelkopf was superior to all the methods except the neural network one, where it was, although occasionally worse, essentially comparable. However, the SVM methods turned out to be quite sensitive to the choice of representation and kernel in ways which are not well understood; therefore, for the time being leaving the neural network approach as the most robust.},
}

@inBook{Manning99a,
  author = {Christopher Manning and Hinrich Sch{\"{u}}tze},
  title = {Foundations of Statistical Natural Language Processing},
  publisher = {The MIT Press},
  address = {Cambridge, US},
  year = {1999},
  chapter = {16: Text Categorization},
  pages = {575--608},
  url = {},
  abstract = {},
}

@article{Maron61,
  author = {M. E. Maron},
  title = {Automatic indexing: an experimental inquiry},
  year = {1961},
  journal = {Journal of the Association for Computing Machinery},
  volume = {8},
  number = {3},
  pages = {404--417},
  url = {http://www.acm.org/pubs/articles/journals/jacm/1961-8-3/p404-maron/p404-maron.pdf},
  abstract = {This inquiry examines a technique for automatically classifying (indexing) documents according to their subject content. The task, in essence, is to have a computing machine read a document and on the basis of the occurrence of selected clue words decide to which of many subject categories the document in question belongs.
This paper describes the design, execution and evaluation of a modest experimental study aimed at testing empirically one statistical technique for automatic indexing.},
}

@inProceedings{Marton05,
  author = {Yuval Marton and Ning Wu and Lisa Hellerstein},
  title = {On compression-based text classification},
  booktitle = {Proceedings of ECIR-05, 27th European Conference on Information Retrieval},
  publisher = {Springer Verlag},
  editor = {David E. Losada and Juan M. Fern{\'{a}}ndez-Luna},
  address = {Santiago de Compostela, ES},
  year = {2005},
  pages = {300--314},
  url = {},
  abstract = {Compression-based text classification methods are easy to apply, requiring virtually no preprocessing of the data. Most such methods are character-based, and thus have the potential to automatically capture non-word features of a document, such as punctuation, word-stems, and features spanning more than one word. However, compression-based classification methods have drawbacks (such as slow running time), and not all such methods are equally effective. We present the results of a number of experiments designed to evaluate the effectiveness and behavior of different compression-based text classification methods on English text. Among our experiments are some specifically designed to test whether the ability to capture non-word features causes character-based text compression methods to achieve more accurate classification.},
}

@inProceedings{Masand92,
  author = {Briji Masand and Gordon Linoff and David Waltz},
  title = {Classifying news stories using memory-based reasoning},
  booktitle = {Proceedings of SIGIR-92, 15th ACM International Conference on Research and Development in Information Retrieval},
  editor = {Nicholas J.
Belkin and Peter Ingwersen and Annelise Mark Pejtersen}, publisher = {ACM Press, New York, US}, address = {Kobenhavn, DK}, pages = {59--65}, year = {1992}, url = {http://www.acm.org/pubs/articles/proceedings/ir/133160/p59-masand/p59-masand.pdf}, abstract = {We describe a method for classifying news stories using Memory Based Reasoning (MBR) a k-nearest neighbor method), that does not require manual topic definitions. Using an already coded training database of about 50,000 stories from the Dow Jones Press Release News Wire, and SEEKER [Stanfill] (a text retrieval system that supports relevance feedback) as the underlying match engine, codes are assigned to new, unseen stories with a recall of about 80\% and precision of about 70\%. There are about 350 different codes to be assigned. Using a massively parallel supercomputer, we leverage the information already contained in the thousands of coded stories and are able to code a story in about 2 seconds. Given SEEKER, the text retrieval system, we achieved these results in about two person-months. We believe this approach is effective in reducing the development time to implement classification systems involving large number of topics for the purpose of classification, message routing etc.}, } @inCollection{Masand94, author = {Briji Masand}, title = {Optimising confidence of text classification by evolution of symbolic expressions}, booktitle = {Advances in genetic programming}, publisher = {The MIT Press}, address = {Cambridge, US}, year = {1994}, chapter = {21}, editor = {Kenneth E. Kinnear}, pages = {459--476}, url = {}, abstract = {}, } @inProceedings{Matsuda98, author = {Katsushi Matsuda and Toshikazu Fukushima}, title = {Task-oriented {World Wide Web} retrieval by document type classification}, booktitle = {Proceedings of CIKM-98, 7th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, editor = {Georges Gardarin and James C. 
French and Niki Pissinou and Kia Makki and Luc Bouganim}, year = {1998}, address = {Bethesda, US}, pages = {109--113}, url = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p109-matsuda/p109-matsuda.pdf}, abstract = {This paper proposes a novel approach to accurately searching Web pages for relevant information in problem solving by specifying a Web document category instead of the user's task. Accessing information from World Wide Web pages as an approach to problem solving has become commonplace. However, such a search is difficult with current search services, since these services only provide keyword-based search methods that are equivalent to narrowing down the target references according to domains. However, problem solving usually involves both a domain and a task. Accordingly, our approach is based on problem solving tasks. To specify a user's problem solving task, we introduce the concept of document types that directly relate to the problem solving tasks; with this approach, users can easily designate problem solving tasks. We implemented PageTypeSearch system based on our approach. Classifier of PageTypeSearch classifies Web pages into the document types by comparing their pages with typical structural characteristics of the types. We compare PageTypeSearch using the document type-indices with a conventional keyword-based search system in experiments. The average precision of the document type-based search is 88.9\%, while the average precision of the keyword-based search is 31.2\%. Moreover, the number of irrelevant references gathered by our system is about one-thirteenth that of traditional keyword-based search systems. Our approach has practical advantages for problem solving by introducing the viewpoint of tasks to achieve higher performance.}, } @inProceedings{McCallum98, author = {Andrew K.
McCallum and Kamal Nigam}, title = {Employing {EM} in pool-based active learning for text classification}, booktitle = {Proceedings of ICML-98, 15th International Conference on Machine Learning}, editor = {Jude W. Shavlik}, year = {1998}, address = {Madison, US}, pages = {350--358}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www.cs.cmu.edu/~mccallum/papers/emactive-icml98.ps.gz}, abstract = {The paper shows how a text classifier's need for labeled training documents can be reduced by taking advantage of a large pool of unlabeled documents. We modify the Query-by-Committee (QBC) method of active learning to use the unlabeled pool for explicitly estimating document density when selecting examples for labeling. Then active learning is combined with Expectation-Maximization in order to ``fill in'' the class labels of those documents that remain unlabeled. Experimental results show that the improvements to active learning require less than two-thirds as many labeled training examples as previous QBC approaches, and that the combination of EM and active learning requires only slightly more than half as many labeled training examples to achieve the same accuracy as either the improved active learning or EM alone.}, } @inProceedings{McCallum98b, author = {Andrew K. McCallum and Ronald Rosenfeld and Tom M. Mitchell and Andrew Y. Ng}, title = {Improving text classification by shrinkage in a hierarchy of classes}, booktitle = {Proceedings of ICML-98, 15th International Conference on Machine Learning}, editor = {Jude W. Shavlik}, year = {1998}, address = {Madison, US}, pages = {359--367}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www.cs.cmu.edu/~mccallum/papers/hier-icml98.ps.gz}, abstract = {When documents are organized in a large number of topic categories, the categories are often arranged in a hierarchy. The US patent database and Yahoo are two examples. 
The paper shows that the accuracy of a naive Bayes text classifier can be significantly improved by taking advantage of a hierarchy of classes. We adopt an established statistical technique called shrinkage that smooths parameter estimates of a data-sparse child with its parent in order to obtain more robust parameter estimates. The approach is also employed in deleted interpolation, a technique for smoothing n-grams in language modeling for speech recognition. Our method scales well to large data sets, with numerous categories in large hierarchies. Experimental results on three real world data sets from UseNet, Yahoo, and corporate Web pages show improved performance, with a reduction in error up to 29\% over the traditional flat classifier.}, } @inProceedings{McCallum98c, author = {Andrew McCallum and K. Nigam}, title = {A comparison of event models for Naive Bayes text classification}, booktitle = {Proceedings of AAAI-98, Workshop on Learning for Text Categorization}, year = {1998}, url = {http://citeseer.nj.nec.com/mccallum98comparison.html}, } @inProceedings{Meretakis00, author = {Dimitris Meretakis and Dimitris Fragoudis and Hongjun Lu and Spiros Likothanassis}, title = {Scalable Association-based Text Classification}, booktitle = {Proceedings of CIKM-00, 9th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, address = {McLean, US}, editor = {Arvin Agah and Jamie Callan and Elke Rundensteiner}, year = {2000}, pages = {373--374}, url = {http://www.cs.ust.hk/~meretaks/papers/mfll-cikm2000.pdf}, abstract = {Naive Bayes (NB) classifier has long been considered a core methodology in text classification mainly due to its simplicity and computational efficiency. There is an increasing need however for methods that can achieve higher classification accuracy while maintaining the ability to process large document collections.
In this paper we examine text categorization methods from a perspective that considers the tradeoff between accuracy and scalability to large data sets and large feature sizes. We start from the observation that Support Vector Machines, one of the best text categorization methods cannot scale up to handle the large document collections involved in many real word problems. We then consider bayesian extensions to NB that achieve higher accuracy by relaxing its strong independence assumptions. Our experimental results show that LB, an association-based lazy classifier can achieve a good tradeoff between high classification accuracy and scalability to large document collections and large feature sizes.}, } @article{Merkl98, author = {Merkl, Dieter}, title = {Text classification with self-organizing maps: Some lessons learned}, journal = {Neurocomputing}, year = {1998}, volume = {21}, number = {1/3}, pages = {61--77}, url = {}, abstract = {We discuss ways of using self-organizing maps for document classification. Furthermore, we focus on the fact that document collections lend themselves naturally to a hierarchical structure defined by the subject matter of the documents. We take advantage of this fact by using a hierarchically organized neural network, built up from a number of independent self-organizing maps in order to enable the true establishment of a document taxonomy. Using such an architecture, the time needed for training is reduced substantially and the user is provided with an even more intuitive metaphor for visualization. 
Since the single layers of self-organizing maps represent different aspects of the document collection at different levels of detail, the neural network shows the document collection in a form comparable to an atlas where the user may easily select the most appropriate degree of granularity depending on the actual focus of interest during the exploration of the document collection.}, } @inProceedings{Meyer04, author = {Sven {Meyer Zu Eissen} and Benno Stein}, title = {Genre Classification of Web Pages}, booktitle = {Proceedings of KI-04, 27th German Conference on Artificial Intelligence}, publisher = {}, editor = {Biundo, Susanne and Fr{\"{u}}hwirth, Thom and Palm, G{\"{u}}nther}, address = {Ulm, DE}, year = {2004}, pages = {}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 3238}, url = {http://www-ai.upb.de/aisearch/ki04-frame.pdf}, abstract = {Genre classification means to discriminate between documents by means of their form, their style, or their targeted audience. Put another way, genre classification is orthogonal to a classification based on the documents' contents. While most of the existing investigations of an automated genre classification are based on news articles corpora, the idea here is applied to arbitrary Web pages. We see genre classification as a powerful instrument to bring Web-based search services closer to a user's information need. This objective raises two questions: (1) What are useful genres when searching the WWW? (2) Can these genres be reliably identified? The paper in hand presents results from a user study on Web genre usefulness as well as results from the construction of a genre classifier using discriminant analysis, neural network learning, and support vector machines. 
Particular attention is turned to a classifier's underlying feature set: Aside from the standard feature types we introduce new features that are based on word frequency classes and that can be computed with minimum computational effort. They allow us to construct compact feature sets with few elements, with which a satisfactory genre diversification is achieved. About 70\% of the Web-documents are assigned to their true genre; note in this connection that no genre classification benchmark for Web pages has been published so far.}, } @article{Mladenic03, author = {Dunja Mladeni{\'{c}} and Marko Grobelnik}, title = {Feature selection on hierarchy of Web documents}, journal = {Decision Support Systems}, year = {2003}, number = {1}, volume = {35}, pages = {45--87}, url = {}, abstract = {The paper describes feature subset selection used in learning on text data (text learning) and gives a brief overview of feature subset selection commonly used in machine learning. Several known and some new feature scoring measures appropriate for feature subset selection on large text data are described and related to each other. Experimental comparison of the described measures is given on real-world data collected from the Web. Machine learning techniques are used on data collected from Yahoo, a large text hierarchy of Web documents. Our approach includes some original ideas for handling large number of features, categories and documents. The high number of features is reduced by feature subset selection and additionally by using `stop-list', pruning low-frequency features and using a short description of each document given in the hierarchy instead of using the document itself. Documents are represented as feature-vectors that include word sequences instead of including only single words as commonly used when learning on text data. An efficient approach to generating word sequences is proposed.
Based on the hierarchical structure, we propose a way of dividing the problem into subproblems, each representing one of the categories included in the Yahoo hierarchy. In our learning experiments, for each of the subproblems, naive Bayesian classifier was used on text data. The result of learning is a set of independent classifiers, each used to predict probability that a new example is a member of the corresponding category. Experimental evaluation on real-world data shows that the proposed approach gives good results. The best performance was achieved by the feature selection based on a feature scoring measure known from information retrieval called Odds ratio and using relatively small number of features.}, } @inProceedings{Mladenic04, author = {Dunja Mladeni{\'{c}} and Janez Brank and Marko Grobelnik and Natasa Mili{\'{c}}-Frayling}, title = {Feature selection using linear classifier weights: interaction with classification models}, booktitle = {Proceedings of SIGIR-04, 27th ACM International Conference on Research and Development in Information Retrieval}, editor = {Kalervo J{\"{a}}rvelin and James Allan and Peter Bruza and Mark Sanderson}, publisher = {ACM Press, New York, US}, address = {Sheffield, UK}, year = {2004}, pages = {234--241}, url = {http://doi.acm.org/10.1145/1008992.1009034}, abstract = {This paper explores feature scoring and selection based on weights from linear classification models. It investigates how these methods combine with various learning models. Our comparative analysis includes three learning algorithms: Naive Bayes, Perceptron, and Support Vector Machines (SVM) in combination with three feature weighting methods: Odds Ratio, Information Gain, and weights from linear models, the linear SVM and Perceptron. Experiments show that feature selection using weights from linear SVMs yields better classification performance than other feature weighting methods when combined with the three explored learning algorithms. 
The results support the conjecture that it is the sophistication of the feature weighting method rather than its apparent compatibility with the learning algorithm that improves classification performance.}, } @inProceedings{Mladenic98a, author = {Dunja Mladeni{\'{c}}}, title = {Turning {{\sc Yahoo!}}\ into an automatic Web page classifier}, booktitle = {Proceedings of ECAI-98, 13th European Conference on Artificial Intelligence}, publisher = {John Wiley and Sons, Chichester, UK}, editor = {Henri Prade}, year = {1998}, pages = {473--474}, address = {Brighton, UK}, url = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/pwwECAI98yr.ps.gz}, abstract = {The paper describes an approach to automatic Web-page classification based on the Yahoo hierarchy. Machine learning techniques developed for learning on text data are used here on the hierarchical classification structure. The high number of features is reduced by taking into account the hierarchical structure and using feature subset selection based on the method known from information retrieval. Documents are represented as feature-vectors that include n-grams instead of including only single words (unigrams) as commonly used when learning on text data. Based on the hierarchical structure the problem is divided into subproblems, each representing one of the categories included in the Yahoo hierarchy. The result of learning is a set of independent classifiers, each used to predict the probability that a new example is a member of the corresponding category. Experimental evaluation on real-world data shows that the proposed approach gives good results.
For more than a half of testing examples a correct category is among the 3 categories with the highest predicted probability.}, } @inProceedings{Mladenic98b, author = {Dunja Mladeni{\'{c}}}, title = {Feature subset selection in text learning}, booktitle = {Proceedings of ECML-98, 10th European Conference on Machine Learning}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1398}, editor = {Claire N{\'{e}}dellec and C{\'{e}}line Rouveirol}, address = {Chemnitz, DE}, pages = {95--100}, year = {1998}, url = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/pwwECML98.ps.gz}, abstract = {This paper describes several known and some new methods for feature subset selection on large text data. Experimental comparison given on real-world data collected from Web users shows that characteristics of the problem domain and machine learning algorithm should be considered when feature scoring measure is selected. Our problem domain consists of hyperlinks given in a form of small-documents represented with word vectors. In our learning experiments naive Bayesian classifier was used on text data. 
The best performance was achieved by the feature selection methods based on the feature scoring measure called Odds ratio that is known from information retrieval.}, } @phdThesis{Mladenic98c, author = {Dunja Mladeni{\'{c}}}, title = {Machine Learning on non-homogeneous, distributed text data}, school = {J.\ Stefan Institute, University of Ljubljana}, address = {Ljubljana, SL}, year = {1998}, url = {http://www-ai.ijs.si/DunjaMladenic/papers/PhD/PhDFinal.ps}, abstract = {}, } @inProceedings{Mladenic98d, author = {Dunja Mladeni{\'{c}} and Marko Grobelnik}, title = {Word sequences as features in text-learning}, booktitle = {Proceedings of ERK-98, the Seventh Electrotechnical and Computer Science Conference}, year = {1998}, address = {Ljubljana, SL}, pages = {145--148}, } @article{Mladenic99, author = {Dunja Mladeni{\'{c}}}, title = {Text learning and related intelligent agents: a survey}, journal = {IEEE Intelligent Systems}, year = {1999}, number = {4}, volume = {14}, pages = {44--54}, url = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/agentOverIEEE.ps.gz}, abstract = {Analysis of text data using intelligent information retrieval, machine learning, natural language processing or other related methods is becoming an important issue for the development of intelligent agents. There are two frequently used approaches to the development of intelligent agents using machine learning techniques: a content-based and a collaborative approach. In the first approach, the content (e.g., text) plays an important role, while in the second approach, the existence of several knowledge sources (e.g., several users) is required. We can say that the usage of machine learning techniques on text databases (usually referred to as text-learning) is an important part of the content-based approach. Examples are agents for locating information on World Wide Web and Usenet news filtering agents.
There are different research questions important for the development of text-learning intelligent agents. We focus on three of them: what representation is used for documents, how is the high number of features dealt with and which learning algorithm is used. These questions are addressed in an overview of the existing approaches to text classification. For illustration we give a brief description of the content-based personal intelligent agent named Personal WebWatcher that uses text-learning for user customized Web browsing.}, } @inProceedings{Mladenic99a, author = {Dunja Mladeni{\'{c}} and Marko Grobelnik}, title = {Feature selection for unbalanced class distribution and Naive Bayes}, booktitle = {Proceedings of ICML-99, 16th International Conference on Machine Learning}, editor = {Ivan Bratko and Saso Dzeroski}, year = {1999}, address = {Bled, SL}, pages = {258--267}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/pwwICML99Final.ps.gz}, abstract = {This paper describes an approach to feature subset selection that takes into account problem specifics and learning algorithm characteristics. It is developed for the Naive Bayesian classifier applied on text data, since it combines well with the addressed learning problems. We focus on domains with many features that also have a highly unbalanced class distribution and asymmetric misclassification costs given only implicitly in the problem. By asymmetric misclassification costs we mean that one of the class values is the target class value for which we want to get predictions and we prefer false positive over false negative. Our example problem is automatic document categorization using machine learning, where we want to identify documents relevant for the selected category. Usually, only about 1\%-10\% of examples belong to the selected category. 
Our experimental comparison of eleven feature scoring measures show that considering domain and algorithm characteristics significantly improves the results of classification.}, } @article{Moens00, author = {Marie-Francine Moens and Jos Dumortier}, title = {Text categorization: the assignment of subject descriptors to magazine articles}, journal = {Information Processing and Management}, pages = {841--861}, year = {2000}, number = {6}, volume = {36}, url = {}, abstract = {Automatic text categorization is an important research area and has a potential for many text-based applications including text routing and filtering. Typical text classifiers learn from example texts that are manually categorized. When categorizing magazine articles with broad subject descriptors, we study three aspects of text classification: (1) effective selection of feature words and proper names that reflect the main topics of the text; (2) learning algorithms; and (3) improvement of the quality of the learned classifier by selection of examples. The chi(2) test, which is sometimes used for selecting terms that are highly related to a text class, is applied in a novel way when constructing a category weight vector. Despite a limited number of training examples, combining an effective feature selection with the chi(2) learning algorithm for training the text classifier results in an adequate categorization of new magazine articles.}, } @inProceedings{Mooney00, author = {Raymond J. 
Mooney and Loriene Roy}, title = {Content-based book recommending using learning for text categorization}, booktitle = {Proceedings of DL-00, 5th ACM Conference on Digital Libraries}, editor = {}, publisher = {ACM Press, New York, US}, year = {2000}, address = {San Antonio, US}, pages = {195--204}, url = {ftp://ftp.cs.utexas.edu/pub/mooney/papers/libra-dl-00.ps.gz}, abstract = {Recommender systems improve access to relevant products and information by making personalized suggestions based on previous examples of a user's likes and dislikes. Most existing recommender systems use collaborative filtering methods that base recommendations on other users' preferences. By contrast, content-based methods use information about an item itself to make suggestions. This approach has the advantage of being able to recommend previously unrated items to users with unique interests and to provide explanations for its recommendations. We describe a content-based book recommending system that utilizes information extraction and a machine-learning algorithm for text categorization. Initial experimental results demonstrate that this approach can produce accurate recommendations.}, } @inProceedings{Moschitti03, author = {Alessandro Moschitti}, title = {A study on optimal parameter tuning for Rocchio text classifier}, booktitle = {Proceedings of ECIR-03, 25th European Conference on Information Retrieval}, publisher = {Springer Verlag}, editor = {Fabrizio Sebastiani}, address = {Pisa, IT}, year = {2003}, pages = {420--435}, url = {http://link.springer.de/link/service/series/0558/papers/2633/26330420.pdf}, abstract = {Current trend in operational text categorization is the designing of fast classification tools. Several studies on improving accuracy of fast but less accurate classifiers have been recently carried out. In particular, enhanced versions of the Rocchio text classifier, characterized by high performance, have been proposed. 
However, even in these extended formulations the problem of tuning its parameters is still neglected. In this paper, a study on parameters of the Rocchio text classifier has been carried out to achieve its maximal accuracy. The result is a model for the automatic selection of parameters. Its main feature is to bind the searching space so that optimal parameters can be selected quickly. The space has been bound by giving a feature selection interpretation of the Rocchio parameters. The benefit of the approach has been assessed via extensive cross evaluation over three corpora in two languages. Comparative analysis shows that the performances achieved are relatively close to the best TC models (e.g. Support Vector Machines).}, } @inProceedings{Moschitti04, author = {Alessandro Moschitti and Roberto Basili}, title = {Complex Linguistic Features for Text Classification: A Comprehensive Study}, booktitle = {Proceedings of ECIR-04, 26th European Conference on Information Retrieval Research}, editor = {Sharon McDonald and John Tait}, year = {2004}, address = {Sunderland, UK}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2997}, pages = {181--196}, url = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2997&spage=181}, abstract = {Previous researches on advanced representations for document retrieval have shown that statistical state-of-the-art models are not improved by a variety of different linguistic representations. Phrases, word senses and syntactic relations derived by Natural Language Processing (NLP) techniques were observed ineffective to increase retrieval accuracy. For Text Categorization (TC) are available fewer and less definitive studies on the use of advanced document representations as it is a relatively new research area (compared to document retrieval). In this paper, advanced document representations have been investigated. 
Extensive experimentation on representative classifiers, Rocchio and SVM, as well as a careful analysis of the literature have been carried out to study how some NLP techniques used for indexing impact TC. Cross validation over 4 different corpora in two languages allowed us to gather an overwhelming evidence that complex nominals, proper nouns and word senses are not adequate to improve TC accuracy.}, } @article{Mostafa00, author = {Javed Mostafa and Wai Lam}, title = {Automatic classification using supervised learning in a medical document filtering application}, journal = {Information Processing and Management}, year = {2000}, volume = {36}, number = {3}, pages = {415--444}, url = {}, abstract = {Document classifiers can play an intermediate role in multilevel filtering systems. The effectiveness of a classifier that uses supervised learning was analyzed in terms of its accuracy and ultimately its influence on filtering. The analysis was conducted in two phases. In the first phase, a multilayer feed-forward neural network was trained to classify medical documents in the area of cell biology. The accuracy of the supervised classifier was established by comparing its performance with a baseline system that uses human classification information. A relatively high degree of accuracy was achieved by the supervised method, however, classification accuracy varied across classes. In the second phase, to clarify the impact of this performance on filtering, different types of user profiles were created by grouping subsets of classes based on their individual classification accuracy rates. Then, a filtering system with the neural network integrated into it was used to filter the medical documents and this performance was compared with the filtering results achieved using the baseline system. 
The performance of the system using the neural network classifier was generally satisfactory and, as expected, the filtering performance varied with regard to the accuracy rates of classes.}, } @inProceedings{Moulinier96, author = {Isabelle Moulinier and Gailius Ra{\v{s}}kinis and Jean-Gabriel Ganascia}, title = {Text categorization: a symbolic approach}, booktitle = {Proceedings of SDAIR-96, 5th Annual Symposium on Document Analysis and Information Retrieval}, publisher = {}, editor = {}, address = {Las Vegas, US}, year = {1996}, pages = {87--99}, url = {http://www-poleia.lip6.fr/~moulinie/sdair.ps.gz}, abstract = {Recent research in machine learning has been concerned with scaling-up to large data sets. Since information retrieval is a domain where such data sets are widespread, it provides an ideal application area for machine learning. This paper studies the ability of symbolic learning algorithms to perform a text categorization task. This ability depends on both text representation and feature filtering. We present a unified view of text categorization systems, focusing on the selection of features. A new selection technique, SCAR, is proposed for k-DNF (disjunctive normal form) learners and evaluated on the Reuters financial data set.
Even though our experimental results do not outperform earlier approaches, they give rise to promising perspectives.}, } @inProceedings{Moulinier96a, author = {Isabelle Moulinier and Jean-Gabriel Ganascia}, title = {Applying an existing machine learning algorithm to text categorization}, booktitle = {Connectionist, statistical, and symbolic approaches to learning for natural language processing}, editor = {Stefan Wermter and Ellen Riloff and Gabriele Scheler}, pages = {343--354}, year = {1996}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1040}, url = {http://www-poleia.lip6.fr/~moulinie/wijcai.ps.gz}, abstract = {The information retrieval community is becoming increasingly interested in machine learning techniques, of which text categorization is an application. This paper describes how we have applied an existing similarity-based learning algorithm, CHARADE, to the text categorization problem and compares the results with those obtained using decision tree construction algorithms. From a machine learning point of view, this study was motivated by the size of the inspected data in such applications. Using the same representation of documents, CHARADE offers better performance than earlier reported experiments with decision trees on the same corpus. 
In addition, the way in which learning with redundancy influences categorization performance is also studied.}, } @inProceedings{Moulinier97, author = {Isabelle Moulinier}, title = {Feature selection: a useful preprocessing step}, booktitle = {Proceedings of BCSIRSG-97, the 19th Annual Colloquium of the British Computer Society Information Retrieval Specialist Group}, publisher = {Springer Verlag, Heidelberg, DE}, series = {Electronic Workshops in Computing}, editor = {Jonathan Furner and David Harper}, address = {Aberdeen, UK}, year = {1997}, pages = {}, url = {http://www.ewic.org.uk/ewic/workshop/fetch.cfm/IRR-97/Moulinier/Moulinier.ps}, abstract = {Statistical classification techniques and machine learning methods have been applied to some information retrieval (IR) problems: routing, filtering and categorization. Most of these methods are usually awkward and sometimes intractable in high-dimensional feature spaces. In order to reduce dimensionality, feature selection has been introduced as a preprocessing step. In this paper, we assess to what extent feature selection can be used without causing a loss in effectiveness. This problem can be tackled since a couple of recent learners (Ripper and Scar) do not require a preprocessing step. On a text categorization task, using the Reuters-22,173 collection, we give empirical evidence that feature selection is useful: first, the size of the collection index can be drastically reduced without causing a significant loss in categorization effectiveness. Then, we show that feature selection speeds up the time required to automatically build the categorization system.}, } @inProceedings{Myers00, author = {Kary Myers and Michael Kearns and Satinder Singh and Marilyn A. 
Walker}, title = {A Boosting Approach to Topic Spotting on Subdialogues}, booktitle = {Proceedings of ICML-00, 17th International Conference on Machine Learning}, editor = {Pat Langley}, year = {2000}, address = {Stanford, US}, pages = {655--662}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www.cs.cmu.edu/~rayid/mypapers/ecoc-icml.ps}, abstract = {We report the results of a study on topic spotting in conversational speech. Using a machine learning approach, we build classifiers that accept an audio file of conversational human speech as input, and output an estimate of the topic being discussed. Our methodology makes use of a well-known corpus of transcribed and topic-labeled speech (the Switchboard corpus), and involves an interesting double use of the BOOSTEXTER learning algorithm. Our work is distinguished from previous efforts in topic spotting by our explicit study of the effects of dialogue length on classifier performance, and by our use of off-the-shelf speech recognition technology. One of our main results is the identification of a single classifier with good performance (relative to our classifier space) across all subdialogue lengths.}, } @inProceedings{Nardiello03, author = {Pio Nardiello and Fabrizio Sebastiani and Alessandro Sperduti}, title = {Discretizing continuous attributes in AdaBoost for text categorization}, booktitle = {Proceedings of ECIR-03, 25th European Conference on Information Retrieval}, publisher = {Springer Verlag}, editor = {Fabrizio Sebastiani}, address = {Pisa, IT}, year = {2003}, pages = {320--334}, url = {http://www.math.unipd.it/~fabseb60/Publications/ECIR03.pdf}, abstract = {We focus on two recently proposed algorithms in the family of ``boosting''-based learners for automated text classification, \textsc{AdaBoost.MH} and \textsc{AdaBoost.MH$^{KR}$}.
While the former is a realization of the well-known \textsc{AdaBoost} algorithm specifically aimed at multi-label text categorization, the latter is a generalization of the former based on the idea of learning a committee of classifier sub-committees. Both algorithms have been among the best performers in text categorization experiments so far. A problem in the use of both algorithms is that they require documents to be represented by binary vectors, indicating presence or absence of the terms in the document. As a consequence, these algorithms cannot take full advantage of the ``weighted'' representations (consisting of vectors of continuous attributes) that are customary in information retrieval tasks, and that provide a much more significant rendition of the document's content than binary representations. In this paper we address the problem of exploiting the potential of weighted representations in the context of \textsc{AdaBoost}-like algorithms by discretizing the continuous attributes through the application of entropy-based discretization methods. We present experimental results on the \textsf{Reuters-21578} text categorization collection, showing that for both algorithms the version with discretized continuous attributes outperforms the version with traditional binary representations.},
}

@inProceedings{Ng97,
  author    = {Hwee T. Ng and Wei B. Goh and Kok L. Low},
  title     = {Feature selection, perceptron learning, and a usability case study for text categorization},
  booktitle = {Proceedings of SIGIR-97, 20th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and A. Desai Narasimhalu and Peter Willett},
  publisher = {ACM Press, New York, US},
  year      = {1997},
  address   = {Philadelphia, US},
  pages     = {67--73},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/258525/p67-ng/p67-ng.pdf},
  abstract  = {In this paper, we describe an automated learning approach to text categorization based on perceptron learning and a new feature selection metric, called correlation coefficient. Our approach has been tested on the standard Reuters text categorization collection. Empirical results indicate that our approach outperforms the best published results on this Reuters collection. In particular, our new feature selection method yields considerable improvement. We also investigate the usability of our automated learning approach by actually developing a system that categorizes texts into a tree of categories. We compare the accuracy of our learning approach to a rule-based, expert system approach that uses a text categorization shell built by Carnegie Group. Although our automated learning approach still gives a lower accuracy, by appropriately incorporating a set of manually chosen words to use as features, the combined, semi-automated approach yields accuracy close to the rule-based approach.},
}

% The first author is given in "Last, First" form so that BibTeX treats the
% compound surname "Nieto Sanchez" as the last name (sorting and labels).
@article{Nieto02,
  author   = {Nieto S{\'{a}}nchez, Salvador and Evangelos Triantaphyllou and Donald Kraft},
  title    = {A feature mining based approach for the classification of text documents into disjoint classes},
  journal  = {Information Processing and Management},
  year     = {2002},
  volume   = {38},
  number   = {4},
  pages    = {583--604},
  url      = {},
  abstract = {This paper proposes a new approach for classifying text documents into two disjoint classes. The new approach is based on extracting patterns, in the form of two logical expressions, which are defined on various features (indexing terms) of the documents. The pattern extraction is aimed at providing descriptions (in the form of two logical expressions) of the two classes of positive and negative examples.
This is achieved by means of a data mining approach, called One Clause At a Time (OCAT), which is based on mathematical logic. The application of a logic-based approach to text document classification is critical when one wishes to be able to justify why a particular document has been assigned to one class versus the other class. This situation occurs, for instance, in declassifying documents that have been previously considered important to national security and thus are currently being kept as secret. Some computational experiments have investigated the effectiveness of the OCAT-based approach and compared it to the well-known vector space model (VSM). These tests also have investigated finding the best indexing terms that could be used in making these classification decisions. The results of these computational experiments on a sample of 2897 text documents from the TIPSTER collection indicate that the first approach has many advantages over the VSM approach for solving this type of text document classification problem. Moreover, a guided strategy for the OCAT-based approach is presented for deciding which document one needs to consider next while building the training example sets.},
}

% The acronym in the title is brace-protected so that sentence-casing styles
% do not turn it into "em".
@article{Nigam00,
  author   = {Kamal Nigam and Andrew K. McCallum and Sebastian Thrun and Tom M. Mitchell},
  title    = {Text Classification from Labeled and Unlabeled Documents using {EM}},
  journal  = {Machine Learning},
  year     = {2000},
  number   = {2/3},
  volume   = {39},
  pages    = {103--134},
  url      = {http://www.cs.cmu.edu/~knigam/papers/emcat-mlj99.ps},
  abstract = {This paper shows that the accuracy of learned text classifiers can be improved by augmenting a small number of labeled training documents with a large pool of unlabeled documents. This is important because in many text classification problems obtaining training labels is expensive, while large quantities of unlabeled documents are readily available.
We introduce an algorithm for learning from labeled and unlabeled documents based on the combination of Expectation-Maximization (EM) and a naive Bayes classifier. The algorithm first trains a classifier using the available labeled documents, and probabilistically labels the unlabeled documents. It then trains a new classifier using the labels for all the documents, and iterates to convergence. This basic EM procedure works well when the data conform to the generative assumptions of the model. However these assumptions are often violated in practice, and poor performance can result. We present two extensions to the algorithm that improve classification accuracy under these conditions: (1) a weighting factor to modulate the contribution of the unlabeled data, and (2) the use of multiple mixture components per class. Experimental results, obtained using text from three different real-world tasks, show that the use of unlabeled data reduces classification error by up to 30\%.},
}

@inProceedings{Nigam00a,
  author    = {Kamal Nigam and Rayid Ghani},
  title     = {Analyzing the applicability and effectiveness of co-training},
  booktitle = {Proceedings of CIKM-00, 9th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  address   = {McLean, US},
  editor    = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
  year      = {2000},
  pages     = {86--93},
  url       = {http://www.cs.cmu.edu/~knigam/papers/cotrain-CIKM00.pdf},
  abstract  = {Recently there has been significant interest in supervised learning algorithms that combine labeled and unlabeled data for text learning tasks. The co-training setting applies to datasets that have a natural separation of their features into two disjoint sets. We demonstrate that when learning from labeled and unlabeled data, algorithms explicitly leveraging a natural independent split of the features outperform algorithms that do not.
When a natural split does not exist, co-training algorithms that manufacture a feature split may outperform algorithms not using a split. These results help explain why co-training algorithms are both discriminative in nature and robust to the assumptions of their embedded classifiers.},
}

% Abstract cleaned of text-extraction artifacts: end-of-line hyphenation
% ("disser- tation", "substan- tially") and a dash that had been garbled into "|".
@phdThesis{Nigam01,
  author   = {Kamal Nigam},
  title    = {Using Unlabeled Data to Improve Text Classification},
  school   = {Computer Science Department, Carnegie Mellon University},
  address  = {Pittsburgh, US},
  year     = {2001},
  url      = {http://www-2.cs.cmu.edu/~knigam/papers/thesis-nigam.pdf},
  abstract = {One key difficulty with text classification learning algorithms is that they require many hand-labeled examples to learn accurately. This dissertation demonstrates that supervised learning algorithms that use a small number of labeled examples and many inexpensive unlabeled examples can create high-accuracy text classifiers. By assuming that documents are created by a parametric generative model, Expectation-Maximization (EM) finds local maximum a posteriori models and classifiers from all the data---labeled and unlabeled. These generative models do not capture all the intricacies of text; however on some domains this technique substantially improves classification accuracy, especially when labeled data are sparse. Two problems arise from this basic approach. First, unlabeled data can hurt performance in domains where the generative modeling assumptions are too strongly violated. In this case the assumptions can be made more representative in two ways: by modeling sub-topic class structure, and by modeling super-topic hierarchical class relationships. By doing so, model probability and classification accuracy come into correspondence, allowing unlabeled data to improve classification performance. The second problem is that even with a representative model, the improvements given by unlabeled data do not sufficiently compensate for a paucity of labeled data.
Here, limited labeled data provide EM initializations that lead to low-probability models. Performance can be significantly improved by using active learning to select high-quality initializations, and by using alternatives to EM that avoid low-probability local maxima.},
}

@inProceedings{Nigam98,
  author    = {Kamal Nigam and Andrew K. McCallum and Sebastian Thrun and Tom M. Mitchell},
  title     = {Learning to classify text from labeled and unlabeled documents},
  booktitle = {Proceedings of AAAI-98, 15th Conference of the American Association for Artificial Intelligence},
  publisher = {AAAI Press, Menlo Park, US},
  editor    = {},
  year      = {1998},
  pages     = {792--799},
  address   = {Madison, US},
  note      = {An extended version appears as~\cite{Nigam00}},
  url       = {http://www.cs.cmu.edu/~knigam/papers/emcat-aaai98.ps},
  abstract  = {In many important text classification problems, acquiring class labels for training documents is costly, while gathering large quantities of unlabeled data is cheap. This paper shows that the accuracy of text classifiers trained with a small number of labeled documents can be improved by augmenting this small training set with a large pool of unlabeled documents. We present a theoretical argument showing that, under common assumptions, unlabeled data contain information about the target function. We then introduce an algorithm for learning from labeled and unlabeled text based on the combination of Expectation-Maximization with a naive Bayes classifier. The algorithm first trains a classifier using the available labeled documents, and probabilistically labels the unlabeled documents; it then trains a new classifier using the labels for all the documents, and iterates to convergence.
Experimental results, obtained using text from three different real-world tasks, show that the use of unlabeled data reduces classification error by up to 33\%.},
}

@inProceedings{Oh00,
  author    = {Hyo-Jung Oh and Sung Hyon Myaeng and Mann-Ho Lee},
  title     = {A practical hypertext categorization method using links and incrementally available class information},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {ACM Press, New York, US},
  address   = {Athens, GR},
  year      = {2000},
  pages     = {264--271},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p264-oh/p264-oh.pdf},
  abstract  = {As WWW grows at an increasing speed, a classifier targeted at hypertext has become in high demand. While document categorization is quite a mature, the issue of utilizing hypertext structure and hyperlinks has been relatively unexplored. In this paper, we propose a practical method for enhancing both the speed and the quality of hypertext categorization using hyperlinks. In comparison against a recently proposed technique that appears to be the only one of the kind, we obtained up to 18.5\% of improvement in effectiveness while reducing the processing time dramatically.
We attempt to explain through experiments what factors contribute to the improvement.},
}

% "Euclidean" is brace-protected in the title so that sentence-casing styles
% keep the capital of the proper adjective.
@inProceedings{Ontrup01,
  author    = {J{\"{o}}rg Ontrup and Helge Ritter},
  title     = {Text Categorization and Semantic Browsing with Self-Organizing Maps on Non-{Euclidean} Spaces},
  booktitle = {Proceedings of PKDD-01, 5th European Conference on Principles and Practice of Knowledge Discovery in Databases},
  editor    = {Luc De Raedt and Arno Siebes},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Freiburg, DE},
  year      = {2001},
  pages     = {338--349},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2168},
  url       = {http://www.techfak.uni-bielefeld.de/ags/ni/publications/papers/OntrupRitter2001-TCA.pdf},
  abstract  = {This paper introduces a new type of Self-Organizing Map (SOM) for Text Categorization and Semantic Browsing. We propose a ``hyperbolic SOM'' (HSOM) based on a regular tesselation of the hyperbolic plane, which is a non-euclidean space characterized by constant negative gaussian curvature. This approach is motivated by the observation that hyperbolic spaces possess a geometry where the size of a neighborhood around a point increases exponentially and therefore provides more freedom to map a complex information space such as language into spatial relations. These theoretical findings are supported by our experiments, which show that hyperbolic SOMs can successfully be applied to text categorization and yield results comparable to other state-of-the-art methods.
Furthermore we demonstrate that the HSOM is able to map large text collections in a semantically meaningful way and therefore allows a ``semantic browsing'' of text databases.},
}

@article{Paijmans98,
  author   = {Paijmans, Hans},
  title    = {Text categorization as an information retrieval task},
  journal  = {The South African Computer Journal},
  year     = {1999},
  pages    = {4--15},
  volume   = {21},
  url      = {},
  abstract = {A number of methods for feature reduction and feature selection in text classification and information retrieval systems are compared. These include feature sets that are constructed by Latent Semantic Indexing, `local dictionaries' in the form of the words that score highest in frequency in positive class examples and feature sets that are constructed by relevance feedback strategies such as J.J. Rocchio's (1971) feedback algorithm or genetic algorithms. Also, different derivations from the normal recall and precision performance indicators are discussed and compared. It was found that categorizers consisting of the words with highest tf.idf values scored best.},
}

@inProceedings{Paliouras99,
  author    = {Georgios Paliouras and Vangelis Karkaletsis and Constantine D. Spyropoulos},
  title     = {Learning rules for large vocabulary word sense disambiguation},
  booktitle = {Proceedings of IJCAI-99, 16th International Joint Conference on Artificial Intelligence},
  editor    = {Thomas Dean},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  year      = {1999},
  pages     = {674--679},
  address   = {Stockholm, SE},
  url       = {http://www.iit.demokritos.gr/~paliourg/papers/IJCAI99.ps.gz},
  abstract  = {Word Sense Disambiguation (WSD) is the process of distinguishing between different senses of a word. In general, the disambiguation rules differ for different words. For this reason, the automatic construction of disambiguation rules is highly desirable. One way to achieve this aim is by applying machine learning techniques to training data containing the various senses of the ambiguous words.
In the work presented here, the decision tree learning algorithm C4.5 is applied on a corpus of financial news articles. Instead of concentrating on a small set of ambiguous words, as done in most of the related previous work, all content words of the examined corpus are disambiguated. Furthermore, the effectiveness of word sense disambiguation for different parts of speech (nouns and verbs) is examined empirically.},
}

% "Sentiment" starts the subtitle after the question mark; it is brace-protected
% because BibTeX only preserves the capital after a colon, not after "?".
@inProceedings{Pang02,
  author    = {Bo Pang and Lillian Lee and Shivakumar Vaithyanathan},
  title     = {Thumbs up? {Sentiment} Classification using Machine Learning Techniques},
  booktitle = {Proceedings of EMNLP-02, 7th Conference on Empirical Methods in Natural Language Processing},
  year      = {2002},
  publisher = {Association for Computational Linguistics, Morristown, US},
  editor    = {},
  pages     = {79--86},
  address   = {Philadelphia, US},
  url       = {http://acl.ldc.upenn.edu/acl2002/EMNLP/pdfs/EMNLP219.pdf},
  abstract  = {We consider the problem of classifying documents not by topic, but by overall sentiment, e.g., determining whether a review is positive or negative. Using movie reviews as data, we find that standard machine learning techniques definitively outperform human-produced baselines. However, the three machine learning methods we employed (Naive Bayes, maximum entropy classification, and support vector machines) do not perform as well on sentiment classification as on traditional topic-based categorization.
We conclude by examining factors that make the sentiment classification problem more challenging.},
}

@article{Park04,
  author   = {Seong-Bae Park and Byoung-Tak Zhang},
  title    = {Co-trained support vector machines for large scale unstructured document classification using unlabeled data and syntactic information},
  journal  = {Information Processing and Management},
  year     = {2004},
  volume   = {40},
  number   = {3},
  pages    = {421--439},
  url      = {},
  abstract = {},
}

% "Bayes" is brace-protected in the title so that sentence-casing styles do
% not lowercase the proper name.
@inProceedings{Peng03,
  author    = {Fuchun Peng and Dale Schuurmans},
  title     = {Combining naive {Bayes} $n$-gram and language models for text classification},
  booktitle = {Proceedings of ECIR-03, 25th European Conference on Information Retrieval},
  publisher = {Springer Verlag},
  editor    = {Fabrizio Sebastiani},
  address   = {Pisa, IT},
  year      = {2003},
  pages     = {335--350},
  url       = {http://link.springer.de/link/service/series/0558/papers/2633/26330335.pdf},
  abstract  = {We augment the naive Bayes model with an n-gram language model to address two shortcomings of naive Bayes text classifiers. The chain augmented naive Bayes classifiers we propose have two advantages over standard naive Bayes classifiers. First, a chain augmented naive Bayes model relaxes some of the independence assumptions of naive Bayes--allowing a local Markov chain dependence in the observed variables--while still permitting efficient inference and learning. Second, smoothing techniques from statistical language modeling can be used to recover better estimates than the Laplace smoothing techniques usually used in naive Bayes classification.
Our experimental results on three real world data sets show that we achieve substantial improvements over standard naive Bayes classification, while also achieving state of the art performance that competes with the best known methods in these cases.},
}

@inProceedings{Peng03a,
  author    = {Fuchun Peng and Dale Schuurmans and Shaojun Wang},
  title     = {Language and Task Independent Text Categorization with Simple Language Models},
  booktitle = {Proceedings of HLT-03, 3rd Human Language Technology Conference},
  publisher = {},
  editor    = {},
  address   = {Edmonton, CA},
  year      = {2003},
  pages     = {},
  url       = {},
  abstract  = {},
}

@inProceedings{Petasis00,
  author    = {Georgios Petasis and Alessandro Cucchiarelli and Paola Velardi and Georgios Paliouras and Vangelis Karkaletsis and Constantine D. Spyropoulos},
  title     = {Automatic adaptation of proper noun dictionaries through cooperation of machine learning and probabilistic methods},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {ACM Press, New York, US},
  address   = {Athens, GR},
  year      = {2000},
  pages     = {128--135},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p128-petasis/p128-petasis.pdf},
  abstract  = {The recognition of Proper Nouns (PNs) is considered an important task in the area of Information Retrieval and Extraction. However the high performance of most existing PN classifiers heavily depends upon the availability of large dictionaries of domain-specific Proper Nouns, and a certain amount of manual work for rule writing or manual tagging. Though it is not a heavy requirement to rely on some existing PN dictionary (often these resources are available on the web), its coverage of a domain corpus may be rather low, in absence of manual updating.
In this paper we propose a technique for the automatic updating of a PN Dictionary through the cooperation of an inductive and a probabilistic classifier. In our experiments we show that, whenever an existing PN Dictionary allows the identification of 50\% of the proper nouns within a corpus, our technique allows, without additional manual effort, the successful recognition of about 90\% of the remaining 50\%.},
}

% The quoted phrase in the abstract uses TeX quotes (``...''), as elsewhere in
% this file; straight double quotes typeset as two closing quotes.
@inProceedings{Peters02,
  author    = {C. Peters and Cornelis H. Koster},
  title     = {Uncertainty-based Noise Reduction and Term selection in Text Categorization},
  booktitle = {Proceedings of ECIR-02, 24th European Colloquium on Information Retrieval Research},
  editor    = {Fabio Crestani and Mark Girolami and Van Rijsbergen, Cornelis J.},
  year      = {2002},
  address   = {Glasgow, UK},
  publisher = {Springer Verlag, Heidelberg, DE},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2291},
  pages     = {248--267},
  url       = {http://link.springer.de/link/service/series/0558/papers/2291/22910248.pdf},
  abstract  = {This paper introduces a new criterium for term selection, which is based on the notion of Uncertainty. Term selection according to this criterium is performed by the elimination of noisy terms on a class-by-class basis, rather than by selecting the most significant ones. Uncertainty-based term selection (UC) is compared to a number of other criteria like Information Gain (IG), simplified chi-square (SX), Term Frequency (TF) and Document Frequency (DF) in a Text Categorization setting. Experiments on data sets with different properties (Reuters-21578, patent abstracts and patent applications) and with two different algorithms (Winnow and Rocchio) show that UC-based term selection is not the most aggressive term selection criterium, but that its effect is quite stable across data sets and algorithms. This makes it a good candidate for a general ``install-and-forget'' term selection mechanism.
We also describe and evaluate a hybrid Term Selection technique, first applying UC to eliminate noisy terms and then using another criterium to select the best terms.},
}

% Abstract: the acronym of "Simple Bayesian Classifier" is written SBC (it read
% "SEC", which looks like a character-recognition slip) -- TODO confirm against the paper.
@inProceedings{Ragas98,
  author    = {Hein Ragas and Cornelis H. Koster},
  title     = {Four text classification algorithms compared on a {Dutch} corpus},
  booktitle = {Proceedings of SIGIR-98, 21st ACM International Conference on Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and Alistair Moffat and Van Rijsbergen, Cornelis J. and Ross Wilkinson and Justin Zobel},
  publisher = {ACM Press, New York, US},
  year      = {1998},
  address   = {Melbourne, AU},
  pages     = {369--370},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/290941/p369-ragas/p369-ragas.pdf},
  abstract  = {We describe an experiment in applying text classification algorithms to Dutch texts. Four well known learning algorithms: Rocchio's algorithm (W.W. Cohen and Y. Singer, 1995), the Simple Bayesian Classifier (SBC) (R.O. Duda and P.E. Hart, 1973), the Sleeping Experts (SE) and Winnow (I. Dagan et al., 1997) were implemented. They were tested on a corpus of articles from the Dutch newspaper NRC, and pre-classified into four categories. The algorithms are compared on learning speed and error rate. We also investigated the effect of discarding terms, using either a dynamic stoplist or the Winnow heuristic.},
}

% publisher added for consistency with the other ACM proceedings entries of
% this file; the editor field is still an unfilled placeholder.
@inProceedings{Rahal04,
  author    = {Imad Rahal and William Perrizo},
  title     = {An optimized approach for {KNN} text categorization using {P-trees}},
  booktitle = {Proceedings of SAC-04, 19th ACM Symposium on Applied Computing},
  editor    = {},
  publisher = {ACM Press, New York, US},
  pages     = {613--617},
  address   = {Nicosia, CY},
  year      = {2004},
  url       = {http://doi.acm.org/10.1145/967900.968026},
  abstract  = {The importance of text mining stems from the availability of huge volumes of text databases holding a wealth of valuable information that needs to be mined.
Text categorization is the process of assigning categories or labels to documents based entirely on their contents. Formally, it can be viewed as a mapping from the document space into a set of predefined class labels (aka subjects or categories); $F: D \rightarrow \lbrace C_1, C_2, \ldots, C_n \rbrace$ where $F$ is the mapping function, $D$ is the document space and $\lbrace C_1, C_2, \ldots, C_n \rbrace$ is the set of class labels. Given an unlabeled document $d$, we need to find its class label, $C_i$, using the mapping function $F$ where $F(d) = C_i$. In this paper, an optimized k-Nearest Neighbors (KNN) classifier that uses intervalization and the P-tree technology to achieve a high degree of accuracy, space utilization and time efficiency is proposed: As new samples arrive, the classifier finds the k nearest neighbors to the new sample from the training space without a single database scan.},
}

% NOTE(review): publisher, address and note were missing and have been added.
% The LNCS volume number (2167) is the one embedded in the url path; the address
% assumes ECML-01 was co-located with PKDD-01 in Freiburg -- TODO confirm.
% The pages field is still missing.
@inProceedings{Raskutti01,
  author    = {Bhavani Raskutti and Herman Ferr{\'{a}} and Adam Kowalczyk},
  title     = {Second Order Features for Maximising Text Classification Performance},
  booktitle = {Proceedings of ECML-01, 12th European Conference on Machine Learning},
  editor    = {Luc De Raedt and Peter A. Flach},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Freiburg, DE},
  year      = {2001},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2167},
  url       = {http://link.springer.de/link/service/series/0558/papers/2167/21670454.pdf},
  abstract  = {The paper demonstrates that the addition of automatically selected word-pairs substantially increases the accuracy of text classification which is contrary to most previously reported research. The word-pairs are selected automatically using a technique based on frequencies of n-grams (sequences of characters), which takes into account both the frequencies of word-pairs as well as the context in which they occur. These improvements are reported for two different classifiers, support vector machines (SVM) and k-nearest neighbours (kNN), and two different text corpora.
For the first of them, a collection of articles from PC Week magazine, the addition of word-pairs increases micro-averaged breakeven accuracy by more than 6\% point from a baseline accuracy (without pairs) of around 40\%. For second one, the standard Reuters benchmark, SVM classifier using augmentation with pairs outperforms all previously reported results.},
}

@inProceedings{Rau91,
  author    = {Lisa F. Rau and Paul S. Jacobs},
  title     = {Creating segmented databases from free text for text retrieval},
  booktitle = {Proceedings of SIGIR-91, 14th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Abraham Bookstein and Yves Chiaramella and Gerard Salton and Vijay V. Raghavan},
  publisher = {ACM Press, New York, US},
  address   = {Chicago, US},
  pages     = {337--346},
  year      = {1991},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/122860/p337-rau/p337-rau.pdf},
  abstract  = {Indexing text for accurate retrieval is a difficult and important problem. On-line information services generally depend on keyword indices rather than other methods of retrieval, because of the practical features of keywords for storage, dissemination, and browsing as well as for retrieval. However, these methods of indexing have two major drawbacks: First, they must be laboriously assigned by human indexers. Second, they are inaccurate, because of mistakes made by these indexers as well as the difficulties users have in choosing keywords for their queries, and the ambiguity a keyword may have. Current natural language text processing (NLP) methods help to overcome these problems. Such methods can provide automatic indexing and keyword assignment capabilities that are at least as accurate as human indexers in many applications. In addition, NLP systems can increase the information contained in keyword fields by separating keywords into segments, or distinct fields that capture certain discriminating content or relations among keywords.
This paper reports on a system that uses natural language text processing to derive keywords from free text news stories, separate these keywords into segments, and automatically build a segmented database. The system is used as part of a commercial news clipping and retrieval product. Preliminary results show improved accuracy, as well as reduced cost, resulting from these automated techniques.},
}

% address follows the "City, country code" convention used by the other
% entries of this file (it read "Washington, DC").
@inProceedings{Rennie03,
  author    = {Jason Rennie and Lawrence Shih and Jaime Teevan and David Karger},
  title     = {Tackling the Poor Assumptions of Naive {Bayes} Text Classifiers},
  booktitle = {Proceedings of ICML-03, 20th International Conference on Machine Learning},
  editor    = {},
  year      = {2003},
  address   = {Washington, US},
  pages     = {},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url       = {http://www.ai.mit.edu/~jrennie/papers/icml03-nb.pdf},
  abstract  = {Naive Bayes is often used as a baseline in text classification because it is fast and easy to implement. Its severe assumptions make such efficiency possible but also adversely affect the quality of its results. In this paper we propose simple, heuristic solutions to some of the problems with Naive Bayes classifiers, addressing both systemic issues as well as problems that arise because text is not actually generated according to a multinomial model. We find that our simple corrections result in a fast algorithm that is competitive with state-of-the-art text classification algorithms such as the Support Vector Machine.},
}

% Country code of Bled corrected to SI (Slovenia); "SL" is Sierra Leone.
% The second editor's name is written with its diacritics.
@inProceedings{Rennie99,
  author    = {Jason Rennie and Andrew K. McCallum},
  title     = {Using reinforcement learning to spider the {Web} efficiently},
  booktitle = {Proceedings of ICML-99, 16th International Conference on Machine Learning},
  editor    = {Ivan Bratko and Sa{\v{s}}o D{\v{z}}eroski},
  year      = {1999},
  address   = {Bled, SI},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  pages     = {335--343},
  url       = {http://www.watson.org/~jrennie/papers/icml99.ps.gz},
  abstract  = {Consider the task of exploring the Web in order to find pages of a particular kind or on a particular topic. This task arises in the construction of search engines and Web knowledge bases. The paper argues that the creation of efficient Web spiders is best framed and solved by reinforcement learning, a branch of machine learning that concerns itself with optimal sequential decision making. One strength of reinforcement learning is that it provides a formalism for measuring the utility of actions that give benefit only in the future. We present an algorithm for learning a value function that maps hyperlinks to future discounted reward using a naive Bayes text classifier. Experiments on two real-world spidering tasks show a three-fold improvement in spidering efficiency over traditional breadth-first search, and up to a two-fold improvement over reinforcement learning with immediate reward only.},
}

% Initials of the second author are separated by a space ("H. F.") so that
% BibTeX sees two given-name tokens.
@article{RibeiroNeto01,
  author   = {Berthier Ribeiro-Neto and Alberto H. F. Laender and Luciano R. {De Lima}},
  title    = {An Experimental Study in Automatically Categorizing Medical Documents},
  journal  = {Journal of the American Society for Information Science and Technology},
  year     = {2001},
  number   = {5},
  pages    = {391--401},
  volume   = {52},
  url      = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76511157&PLACEBO=IE.pdf},
  abstract = {In this article, we evaluate the retrieval performance of an algorithm that automatically categorizes medical documents.
The categorization, which consists in assigning an International Code of Disease (ICD) to the medical document under examination, is based on well-known information retrieval techniques. The algorithm, which we proposed, operates in a fully automatic mode and requires no supervision or training data. Using a database of 20,569 documents, we verify that the algorithm attains levels of average precision in the 70-80\% range for category coding and in the 60-70\% range for subcategory coding. We also carefully analyze the case of those documents whose categorization is not in accordance with the one provided by the human specialists. The vast majority of them represent cases that can only be fully categorized with the assistance of a human subject (because, for instance, they require specific knowledge of a given pathology). For a slim fraction of all documents (0.77\% for category coding and 1.4\% for subcategory coding), the algorithm makes assignments that are clearly incorrect. However, this fraction corresponds to only one-fourth of the mistakes made by the human specialists.},
}

% year corrected to 1992: the entry is for AAAI-92 (see key and booktitle),
% but the year field read 1998.
@inProceedings{Riloff92,
  author    = {Ellen Riloff and Wendy Lehnert},
  title     = {Classifying Texts Using Relevancy Signatures},
  booktitle = {Proceedings of AAAI-92, 10th Conference of the American Association for Artificial Intelligence},
  publisher = {AAAI Press, Menlo Park, US},
  editor    = {},
  year      = {1992},
  pages     = {329--334},
  address   = {San Jose, US},
  url       = {},
  abstract  = {},
}

@inProceedings{Riloff93,
  author    = {Ellen Riloff},
  title     = {Using Cases to Represent Context for Text Classification},
  booktitle = {Proceedings of CIKM-93, 2nd International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor    = {Bharat Bhargava and Timothy Finin and Yelena Yesha},
  year      = {1993},
  address   = {New York, US},
  pages     = {105--113},
  url       = {http://www.cs.utah.edu/~riloff/psfiles/cikm93-w-addend.ps},
  abstract  = {Research on text classification has typically focused
on keyword searches and statistical techniques. Keywords alone cannot always distinguish the relevant from the irrelevant texts and some relevant texts do not contain any reliable keywords at all. Our approach to text classification uses case-based reasoning to represent natural language contexts that can be used to classify texts with extremely high precision. The case base of natural language contexts is acquired automatically during sentence analysis using a training corpus of texts and their correct relevancy classifications. A text is represented as a set of cases and we classify a text as relevant if any of its cases is deemed to be relevant. We rely on the statistical properties of the case base to determine whether similar cases are highly correlated with relevance for the domain. Experiments with the MUC corpus suggest that case-based text classification can achieve very high levels of precision and outperforms our previous algorithms based on relevancy signatures.}, } @article{Riloff94, author = {Ellen Riloff and Wendy Lehnert}, title = {Information extraction as a basis for high-precision text classification}, journal = {ACM Transactions on Information Systems}, year = {1994}, number = {3}, volume = {12}, pages = {296--333}, url = {http://www.cs.utah.edu/~riloff/psfiles/single-acm.ps}, abstract = {We describe an approach to text classification that represents a compromise between traditional word-based techniques and in-depth natural language processing. Our approach uses a natural language processing task called information extraction as a basis for high-precision text classification. We present three algorithms that use varying amounts of extracted information to classify texts. The relevancy signatures algorithm uses linguistic phrases, the augmented relevancy signatures algorithm uses phrases and local context, and the case-based text classification algorithm uses larger pieces of context. 
Relevant phrases and contexts are acquired automatically using a training corpus. We evaluate the algorithms on the basis of two test sets from the MUC-4 corpus. All three algorithms achieved high precision on both test sets, with the augmented relevancy signatures algorithm and the case-based algorithm reaching 100\% precision with over 60\% recall on one set. In addition, we compare the algorithms on a larger collection of 1700 texts and describe an automated method for empirically deriving appropriate threshold values. The results suggest that information extraction techniques can support high-precision text classification and, in general, using more extracted information improves performance. As a practical matter, we also explain how the text classification system can be easily ported across domains.}, } @phdThesis{Riloff94a, author = {Ellen Riloff}, title = {Information Extraction as a Basis for Portable Text Classification Systems}, school = {Department of Computer Science, University of Massachusetts}, address = {Amherst, US}, year = {1994}, url = {http://www.cs.utah.edu/~riloff/psfiles/single-thesis.ps}, abstract = {Knowledge-based natural language processing systems have achieved good success with many tasks, but they often require many person-months of effort to build an appropriate knowledge base. As a result, they are not portable across domains. This knowledge-engineering bottleneck must be addressed before knowledge-based systems will be practical for real-world applications. This dissertation addresses the knowledge-engineering bottleneck for a natural language processing task called ``information extraction''. A system called AutoSlog is presented which automatically constructs dictionaries for information extraction, given an appropriate training corpus. 
In the domain of terrorism, AutoSlog created a dictionary using a training corpus and five person-hours of effort that achieved 98\% of the performance of a hand-crafted dictionary that took approximately 1500 person-hours to build. This dissertation also describes three algorithms that use information extraction to support high-precision text classification. As more information becomes available on-line, intelligent information retrieval will be crucial in order to navigate the information highway efficiently and effectively. The approach presented here represents a compromise between keyword-based techniques and in-depth natural language processing. The text classification algorithms classify texts with high accuracy by using an underlying information extraction system to represent linguistic phrases and contexts. Experiments in the terrorism domain suggest that increasing the amount of linguistic context can improve performance. Both AutoSlog and the text classification algorithms are evaluated in three domains: terrorism, joint ventures, and microelectronics. An important aspect of this dissertation is that AutoSlog and the text classification systems can be easily ported across domains.}, } @inProceedings{Riloff95, author = {Ellen Riloff}, title = {Little Words Can Make a Big Difference for Text Classification}, booktitle = {Proceedings of SIGIR-95, 18th ACM International Conference on Research and Development in Information Retrieval}, editor = {Edward A. Fox and Peter Ingwersen and Raya Fidel}, publisher = {ACM Press, New York, US}, year = {1995}, address = {Seattle, US}, pages = {130--136}, url = {http://www.cs.utah.edu/~riloff/psfiles/sigir95.ps}, abstract = {Most information retrieval systems use stopword lists and stemming algorithms. However, we have found that recognizing singular and plural nouns, verb forms, negation, and prepositions can produce dramatically different text classification results. 
We present results from text classification experiments that compare relevancy signatures, which use local linguistic context, with corresponding indexing terms that do not. In two different domains, relevancy signatures produced better results than the simple indexing terms. These experiments suggest that stopword lists and stemming algorithms may remove or conflate many words that could be used to create more effective indexing terms.}, } @inProceedings{Riloff96, author = {Ellen Riloff}, title = {Using Learned Extraction Patterns for Text Classification}, booktitle = {Connectionist, statistical, and symbolic approaches to learning for natural language processing}, editor = {Stefan Wermter and Ellen Riloff and Gabriele Scheler}, pages = {275--289}, year = {1996}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1040}, url = {http://www.cs.utah.edu/~riloff/psfiles/ijcai-book-chapter.ps}, abstract = {A major knowledge-engineering bottleneck for information extraction systems is the process of constructing an appropriate dictionary of extraction patterns. AutoSlog is a dictionary construction system that has been shown to substantially reduce the time required for knowledge engineering by learning extraction patterns automatically. However, an open question was whether these extraction patterns were useful for tasks other than information extraction. The author describes a series of experiments that show how the extraction patterns learned by AutoSlog can be used for text classification. 
Three dictionaries produced by AutoSlog for different domains performed well in the author's text classification experiments.}, } @inCollection{Riloff99, author = {Ellen Riloff and Jeffrey Lorenzen}, title = {Extraction-based Text Categorization: Generating Domain-specific Role Relationships}, booktitle = {Natural language information retrieval}, editor = {Tomek Strzalkowski}, year = {1999}, pages = {167--196}, publisher = {Kluwer Academic Publishers}, address = {Dordrecht, NL}, url = {http://www.cs.utah.edu/~riloff/psfiles/nlp-ir-chapter.ps}, abstract = {In previous work, we developed several algorithms that use information extraction techniques to achieve high-precision text categorization. The relevancy signatures algorithm classifies texts using extraction patterns, and the augmented relevancy signatures algorithm classifies texts using extraction patterns and semantic features associated with role fillers (Riloff and Lehnert, 1994). These algorithms relied on hand-coded training data, including annotated texts and a semantic dictionary. In this chapter, we describe two advances that significantly improve the practicality of our approach. First, we explain how the extraction patterns can be generated automatically using only preclassified texts as input. Second, we present the word-augmented relevancy signatures algorithm that uses lexical items to represent domain-specific role relationships instead of semantic features. Using these techniques, we can automatically build text categorization systems that benefit from domain-specific natural language processing.}, } @article{Robertson84, author = {Stephen E. Robertson and P. 
Harding}, title = {Probabilistic automatic indexing by learning from human indexers}, year = {1984}, journal = {Journal of Documentation}, volume = {40}, number = {4}, pages = {264--270}, url = {}, abstract = {}, } @inProceedings{Rose02, author = {Tony Rose and Mark Stevenson and Miles Whitehead}, title = {The {Reuters Corpus Volume 1} -- from Yesterday's News to Tomorrow's Language Resources}, booktitle = {Proceedings of LREC-02, 3rd International Conference on Language Resources and Evaluation}, year = {2002}, address = {Las Palmas, ES}, pages = {827--832}, } @inProceedings{Rosso04, author = {Paolo Rosso and Antonio Molina and Ferran Pla and Daniel Jim{\'{e}}nez and Vicent Vidal}, title = {Information Retrieval and Text Categorization with Semantic Indexing}, booktitle = {Proceedings of CICLING-04, 5th International Conference on Computational Linguistics and Intelligent Text Processing}, year = {2004}, editor = {Alexander F. Gelbukh}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Seoul, KO}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2945}, pages = {596--600}, url = {}, abstract = {}, } @inProceedings{Roth98, author = {Dan Roth}, title = {Learning to resolve natural language ambiguities: a unified approach}, booktitle = {Proceedings of AAAI-98, 15th Conference of the American Association for Artificial Intelligence}, publisher = {AAAI Press, Menlo Park, US}, editor = {}, year = {1998}, pages = {806--813}, address = {Madison, US}, url = {http://l2r.cs.uiuc.edu/~danr/Papers/aaai98.ps.gz}, abstract = {We analyze a few of the commonly used statistics based and machine learning algorithms for natural language disambiguation tasks and observe that they can be recast as learning linear separators in the feature space. Each of the methods makes a priori assumptions, which it employs, given the data, when searching for its hypothesis. 
Nevertheless, as we show, it searches a space that is as rich as the space of all linear separators. We use this to build an argument for a data driven approach which merely searches for a good linear separator in the feature space, without further assumptions on the domain or a specific problem. We present such an approach - a sparse network of linear separators, utilizing the Winnow learning algorithm - and show how to use it in a variety of ambiguity resolution problems. The learning approach presented is attribute-efficient and, therefore, appropriate for domains having very large number of attributes. In particular, we present an extensive experimental comparison of our approach with other methods on several well studied lexical disambiguation tasks such as context-sensitive spelling correction, prepositional phrase attachment and part of speech tagging. In all cases we show that our approach either outperforms other methods tried for these tasks or performs comparably to the best.}, } @article{Ruiz02, author = {Miguel Ruiz and Padmini Srinivasan}, title = {Hierarchical text classification using neural networks}, journal = {Information Retrieval}, number = {1}, volume = {5}, pages = {87--118}, year = {2002}, url = {http://www.wkap.nl/article.pdf?383232}, abstract = {This paper presents the design and evaluation of a text categorization method based on the Hierarchical Mixture of Experts model. This model uses a divide and conquer principle to define smaller categorization problems based on a predefined hierarchical structure. The final classifier is a hierarchical array of neural networks. The method is evaluated using the UMLS Metathesaurus as the underlying hierarchical structure, and the OHSUMED test set of MEDLINE records. Comparisons with an optimized version of the traditional Rocchio's algorithm adapted for text categorization, as well as flat neural network classifiers are provided. 
The results show that the use of the hierarchical structure improves text categorization performance with respect to an equivalent flat model. The optimized Rocchio algorithm achieves a performance comparable with that of the hierarchical neural networks.}, } @inProceedings{Ruiz97, author = {Miguel E. Ruiz and Padmini Srinivasan}, title = {Automatic Text Categorization Using Neural Networks}, booktitle = {Proceedings of the 8th ASIS/SIGCR Workshop on Classification Research}, editor = {Efthimis Efthimiadis}, publisher = {American Society for Information Science, Washington, US}, year = {1997}, address = {Washington, US}, pages = {59--72}, url = {http://www.cs.uiowa.edu/~mruiz/papers/sigcr97/sigcrfinal2.html}, abstract = {This paper presents the results obtained from a series of experiments in automatic text categorization of MEDLINE articles. The main goal of this research is to build neural networks and to train them in assigning MeSH phrases based on term frequency of single words from TITLE and abstract. The experiments compare the performance of a counterpropagation network against a backpropagation neural network. Results obtained by using a set of 2,344 MEDLINE documents are presented and discussed.}, } @inProceedings{Ruiz99, author = {Miguel E. Ruiz and Padmini Srinivasan}, title = {Hierarchical neural networks for text categorization}, booktitle = {Proceedings of SIGIR-99, 22nd ACM International Conference on Research and Development in Information Retrieval}, editor = {Marti A. Hearst and Fredric Gey and Richard Tong}, publisher = {ACM Press, New York, US}, address = {Berkeley, US}, year = {1999}, pages = {281--282}, url = {http://www.acm.org/pubs/articles/proceedings/ir/312624/p281-ruiz/p281-ruiz.pdf}, abstract = {This paper presents the design and evaluation of a text categorization method based on the Hierarchical Mixture of Experts model. 
This model uses a divide and conquer principle to define smaller categorization problems based on a predefined hierarchical structure. The final classifier is a hierarchical array of neural networks. The method is evaluated using the UMLS Metathesaurus as the underlying hierarchical structure, and the OHSUMED test set of MEDLINE records. Comparisons with traditional Rocchio's algorithm adapted for text categorization, as well as flat neural network classifiers are provided. The results show that the use of the hierarchical structure improves text categorization performance significantly.}, } @inProceedings{Ruiz99a, author = {Miguel E. Ruiz and Padmini Srinivasan}, title = {Combining Machine Learning and Hierarchical Indexing Structures for Text Categorization}, booktitle = {Proceedings of the 10th ASIS/SIGCR Workshop on Classification Research}, editor = {}, publisher = {American Society for Information Science, Washington, US}, year = {1999}, address = {Washington, US}, pages = {}, url = {http://www.cs.uiowa.edu/~mruiz/papers/sigcr_10}, abstract = {This paper presents a method that exploits the hierarchical structure of an indexing vocabulary to guide the development and training of machine learning methods for automatic text categorization. We present the design of a hierarchical classifier based on the divide and conquer principle. The method is evaluated using backpropagation neural networks, as the machine learning algorithm, that learn to assign MeSH categories to a subset of MEDLINE records. Comparisons with traditional Rocchio's algorithm adapted for text categorization, as well as flat neural network classifiers are provided. The results indicate that the use of hierarchical structures improves performance significantly.}, } @article{Sable00, author = {Carl L. 
Sable and Vasileios Hatzivassiloglou}, title = {Text-based approaches for non-topical image categorization}, journal = {International Journal of Digital Libraries}, year = {2000}, number = {3}, volume = {3}, pages = {261--275}, url = {http://www.cs.columbia.edu/~sable/research/ijodl00.pdf}, abstract = {The rapid expansion of multimedia digital collections brings to the fore the need for classifying not only text documents but their embedded non-textual parts as well. We propose a model for basing classification of multimedia on broad, non-topical features, and show how information on targeted nearby pieces of text can be used to effectively classify photographs on a first such feature, distinguishing between indoor and outdoor images. We examine several variations to a TF*IDF-based approach for this task, empirically analyze their effects, and evaluate our system on a large collection of images from current news newsgroups. In addition, we investigate alternative classification and evaluation methods, and the effects that secondary features have on indoor/outdoor classification. 
Using density estimation over the raw TF*IDF values, we obtain a classification accuracy of 82\%, a number that outperforms baseline estimates and earlier, image-based approaches, at least in the domain of news articles, and that nears the accuracy of humans who perform the same task with access to comparable information.}, } @inProceedings{Sable01, author = {Carl Sable and Ken Church}, title = {Using Bins to Empirically Estimate Term Weights for Text Categorization}, booktitle = {Proceedings of EMNLP-01, 6th Conference on Empirical Methods in Natural Language Processing}, year = {2001}, publisher = {Association for Computational Linguistics, Morristown, US}, editor = {Lillian Lee and Donna Harman}, pages = {58--66}, address = {Pittsburgh, US}, url = {http://www.cs.columbia.edu/~sable/research/emnlp01.ps}, abstract = {This paper introduces a term weighting method for text categorization based on smoothing ideas borrowed from speech recognition. Empirical estimates of weights (likelihood ratios) become unstable when counts are small. Instead of estimating weights for individual words, as Naive Bayes does, words with similar features are grouped into bins, and a single weight is estimated for each bin. This weight is then assigned to all of the words in the bin. The bin-based method is intended for tasks where there is insufficient training data to estimate a separate weight for each word. Experiments show the bin-based method is highly competitive with other current methods. In particular, this method is most similar to Naive Bayes; it generally performs at least as well as Naive Bayes, and sometimes better.}, } @inProceedings{Sable02, author = {Carl Sable and Kathleen McKeown and Kenneth W. 
Church}, title = {NLP Found Helpful (at least for one Text Categorization Task)}, booktitle = {Proceedings of EMNLP-02, Conference on Empirical Methods in Natural Language Processing}, address = {Philadelphia, US}, year = {2002}, publisher = {Association for Computational Linguistics}, pages = {172--179}, } @inProceedings{Sable99, author = {Carl L. Sable and Vasileios Hatzivassiloglou}, title = {Text-based approaches for the categorization of images}, booktitle = {Proceedings of ECDL-99, 3rd European Conference on Research and Advanced Technology for Digital Libraries}, editor = {Serge Abiteboul and Anne-Marie Vercoustre}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1696. An extended version appears as~\cite{Sable00}}, year = {1999}, address = {Paris, FR}, pages = {19--38}, url = {http://www.cs.columbia.edu/~sable/research/ecdl99.ps}, abstract = {The rapid expansion of multimedia digital collections brings to the fore the need for classifying not only text documents but their embedded non-textual parts as well. We propose a model for basing classification of multimedia on broad, non-topical features, and show how information on targeted nearby pieces of text can be used to effectively classify photographs on a first such feature, distinguishing between indoor and outdoor images. We examine several variations to a TF*IDF-based approach for this task, empirically analyze their effects, and evaluate our system on a large collection of images from current news newsgroups. In addition, we investigate alternative classification and evaluation methods, and the effect that a secondary feature can have on indoor/outdoor classification. 
We obtain a classification accuracy of 82\%, a number that clearly outperforms baseline estimates and competing image-based approaches and nears the accuracy of humans who perform the same task with access to comparable information.}, } @inProceedings{Sahami96, author = {Mehran Sahami and Marti A. Hearst and Eric Saund}, title = {Applying the Multiple Cause Mixture Model to Text Categorization}, booktitle = {Proceedings of ICML-96, 13th International Conference on Machine Learning}, editor = {Lorenza Saitta}, year = {1996}, address = {Bari, IT}, pages = {435--443}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://robotics.stanford.edu/users/sahami/papers-dir/ml96-mcmm.ps}, abstract = {The paper introduces the use of the Multiple Cause Mixture Model for automatic text category assignment. Although much research has been done on text categorization, this algorithm is novel in that it is unsupervised, i.e., it does not require pre-labeled training examples, and it can assign multiple category labels to documents. We present very preliminary results of the application of this model to a standard test collection, evaluating it in supervised mode in order to facilitate comparison with other methods, and showing initial results of its use in unsupervised mode.}, } @proceedings{Sahami98a, editor = {Mehran Sahami}, title = {Proceedings of the 1998 Workshop on Learning for Text Categorization}, institution = {American Association for Artificial Intelligence}, note = {Available as Technical Report WS-98-05}, address = {Madison, US}, year = {1998}, url = {}, } @inProceedings{Sahami98b, author = {Mehran Sahami and Salim Yusufali and Michelle Q. Baldonado}, title = {SONIA: a service for organizing networked information autonomously}, booktitle = {Proceedings of DL-98, 3rd ACM Conference on Digital Libraries}, editor = {Ian Witten and Rob Akscyn and Frank M. 
Shipman}, publisher = {ACM Press, New York, US}, year = {1998}, address = {Pittsburgh, US}, pages = {200--209}, url = {http://robotics.stanford.edu/users/sahami/papers-dir/dl98-sonia.ps}, abstract = {The recent explosion of online information in digital libraries and on the World Wide Web has given rise to a number of query-based search engines and manually constructed topical hierarchies. However, these tools are quickly becoming inadequate as query results grow incomprehensibly large and manual classification in topic hierarchies creates an immense bottleneck. We address these problems with a system for topical information space navigation that combines the query-based and taxonomic systems. We employ machine learning techniques to create dynamic document categorizations based on the full-text of articles that are retrieved in response to users' queries. Our system, named SONIA (Service for Organizing Networked Information Autonomously), has been implemented as part of the Stanford Digital Libraries Testbed. It employs a combination of technologies that takes the results of queries to networked information sources and, in real-time, automatically retrieve, parse and organize these documents into coherent categories for presentation to the user. Moreover, the system can then save such document organizations in user profiles which can then be used to help classify future query results by the same user. SONIA uses a multi-tier approach to extracting relevant terms from documents as well as statistical clustering methods to determine potential topics within a document collection. It also makes use of Bayesian classification techniques to classify new documents within an existing categorization scheme. 
In this way, it allows users to navigate the results of a query at a more topical level rather than having to examine each document text separately.}, } @article{Sakakibara96, author = {Yasubumi Sakakibara and Kazuo Misue and Takeshi Koshiba}, title = {A machine learning approach to knowledge acquisitions from text databases}, year = {1996}, journal = {International Journal of Human Computer Interaction}, volume = {8}, number = {3}, pages = {309--324}, url = {}, abstract = {The rapid growth of data in large databases, such as text databases and scientific databases, requires efficient computer methods for automating analyses of the data with the goal of acquiring knowledges or making discoveries. Because the analyses of data are generally so expensive, most parts in databases remains as raw, unanalyzed primary data. Technology from machine learning (ML) will offer efficient tools for the intelligent analyses of the data using generalization ability. Generalization is an important ability specific to inductive learning that will predict unseen data with high accuracy based on learned concepts from training examples. In this article, we apply ML to text-database analyses and knowledge acquisitions from text databases. We propose a completely new approach to the problem of text classification and extracting keywords by using ML techniques. We introduce a class of representations for classifying text data based on decision trees; (i.e., decision trees over attributes on strings) and present an algorithm for learning them inductively. Our algorithm has the following features: It does not need any natural language processing technique and it is robust for noisy data. We show that our learning algorithm can be used for automatic extraction of keywords for text retrieval and automatic text categorization. 
We also demonstrate some experimental results using our algorithm on the problem of classifying bibliographic data and extracting keywords in order to show the effectiveness of our approach.}, } @inProceedings{Sakkis01, author = {Georgios Sakkis and Ion Androutsopoulos and Georgios Paliouras and Vangelis Karkaletsis and Constantine D. Spyropoulos and Panagiotis Stamatopoulos}, title = {Stacking Classifiers for Anti-Spam Filtering of E-Mail}, booktitle = {Proceedings of EMNLP-01, 6th Conference on Empirical Methods in Natural Language Processing}, year = {2001}, publisher = {Association for Computational Linguistics, Morristown, US}, editor = {Lillian Lee and Donna Harman}, pages = {44--50}, address = {Pittsburgh, US}, url = {http://arXiv.org/pdf/cs/0106040}, abstract = {We evaluate empirically a scheme for combining classifiers, known as stacked generalization, in the context of anti-spam filtering, a novel cost-sensitive application of text categorization. Unsolicited commercial e-mail, or ``spam'', floods mailboxes, causing frustration, wasting bandwidth, and exposing minors to unsuitable content. Using a public corpus, we show that stacking can improve the efficiency of automatically induced anti-spam filters, and that such filters can be used in real-life applications.}, } @article{Sakkis03, author = {Georgios Sakkis and Ion Androutsopoulos and Georgios Paliouras and Vangelis Karkaletsis and Constantine D. 
Spyropoulos and Panagiotis Stamatopoulos}, title = {A Memory-Based Approach to Anti-Spam Filtering for Mailing Lists}, journal = {Information Retrieval}, publisher = {Kluwer Academic Publishers}, issn = {1386-4564}, number = {1}, volume = {6}, pages = {49--73}, year = {2003}, url = {http://www.kluweronline.com/issn/1386-4564}, abstract = {This paper presents an extensive empirical evaluation of memory-based learning in the context of anti-spam filtering, a novel cost-sensitive application of text categorization that attempts to identify automatically unsolicited commercial messages that flood mailboxes. Focusing on anti-spam filtering for mailing lists, a thorough investigation of the effectiveness of a memory-based anti-spam filter is performed using a publicly available corpus. The investigation includes different attribute and distance-weighting schemes, and studies on the effect of the neighborhood size, the size of the attribute set, and the size of the training corpus. Three different cost scenarios are identified, and suitable cost-sensitive evaluation functions are employed. We conclude that memory-based anti-spam filtering for mailing lists is practically feasible, especially when combined with additional safety nets. 
Compared to a previously tested Naive Bayes filter, the memory-based filter performs on average better, particularly when the misclassification cost for non-spam messages is high.}, } @inProceedings{Sasaki98, author = {Minoru Sasaki and Kenji Kita}, title = {Automatic text categorization based on hierarchical rules}, booktitle = {Proceedings of the 5th International Conference on Soft Computing and Information}, publisher = {World Scientific, Singapore, SN}, address = {Iizuka, JP}, year = {1998}, pages = {935--938}, url = {http://www-a2k.is.tokushima-u.ac.jp/member/sasaki/frame_home/Papers/IIZUKA98.ps}, abstract = {Document categorization, which is defined as the classification of text documents into one of several fixed classes or categories, has become important with the explosive growth of the World Wide Web. The goal of the work described in this paper is to automatically categorize Web documents in order to enable effective retrieval of Web information. In this paper, based on the rule learning algorithm RIPPER (Repeated Incremental Pruning to Produce Error Reduction), we propose an efficient method for hierarchical document categorization.}, } @inProceedings{Sasaki98a, author = {Minoru Sasaki and Kenji Kita}, title = {Rule-based text categorization using hierarchical categories}, booktitle = {Proceedings of SMC-98, IEEE International Conference on Systems, Man, and Cybernetics}, editor = {}, publisher = {IEEE Computer Society Press, Los Alamitos, US}, address = {La Jolla, US}, year = {1998}, pages = {2827--2830}, url = {http://www-a2k.is.tokushima-u.ac.jp/member/sasaki/frame_home/Papers/SMC.ps}, abstract = {Document categorization, which is defined as the classification of text documents into one of several fixed classes or categories, has become important with the explosive growth of the World Wide Web. The goal of the work described here is to automatically categorize Web documents in order to enable effective retrieval of Web information. 
In this paper, based on the rule learning algorithm RIPPER (for Repeated Incremental Pruning to Produce Error Reduction), we propose an efficient method for hierarchical document categorization.}, } @article{Schapire00, author = {Schapire, Robert E. and Singer, Yoram}, title = {{{\sc BoosTexter}}: a boosting-based system for text categorization}, journal = {Machine Learning}, year = {2000}, number = {2/3}, volume = {39}, pages = {135--168}, url = {http://www.research.att.com/~schapire/papers/SchapireSi98b.ps.Z}, abstract = {This work focuses on algorithms which learn from examples to perform multiclass text and speech categorization tasks. Our approach is based on a new and improved family of boosting algorithms. We describe in detail an implementation, called BoosTexter, of the new boosting algorithms for text categorization tasks. We present results comparing the performance of BoosTexter and a number of other text-categorization algorithms on a variety of tasks. We conclude by describing the application of our system to automatic call-type identification from unconstrained spoken customer responses.}, } @inProceedings{Schapire98, author = {Schapire, Robert E. and Singer, Yoram and Singhal, Amit}, title = {Boosting and {Rocchio} applied to text filtering}, booktitle = {Proceedings of SIGIR-98, 21st ACM International Conference on Research and Development in Information Retrieval}, editor = {W. Bruce Croft and Alistair Moffat and Van Rijsbergen, Cornelis J. and Ross Wilkinson and Justin Zobel}, publisher = {ACM Press, New York, US}, year = {1998}, address = {Melbourne, AU}, pages = {215--223}, url = {http://www.research.att.com/~schapire/cgi-bin/uncompress-papers/SchapireSiSi98.ps}, abstract = {We discuss two learning algorithms for text filtering: modified Rocchio and a boosting algorithm called AdaBoost. 
We show how both algorithms can be adapted to maximize any general utility matrix that associates cost (or gain) for each pair of machine prediction and correct label. We first show that AdaBoost significantly outperforms another highly effective text filtering algorithm. We then compare AdaBoost and Rocchio over three large text filtering tasks. Overall both algorithms are comparable and are quite effective. AdaBoost produces better classifiers than Rocchio when the training collection contains a very large number of relevant documents. However, on these tasks, Rocchio runs much faster than AdaBoost.}, } @inProceedings{Scheffer99, author = {Tobias Scheffer and Thorsten Joachims}, title = {Expected error analysis for model selection}, booktitle = {Proceedings of ICML-99, 16th International Conference on Machine Learning}, editor = {Ivan Bratko and Saso Dzeroski}, year = {1999}, address = {Bled, SL}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, pages = {361--370}, url = {http://www-ai.cs.uni-magdeburg.de/~scheffer/papers/icml99.ps}, abstract = {In order to select a good hypothesis language (or model) from a collection of possible models, one has to assess the generalization performance of the hypothesis which is returned by a learner that is bound to use that model. The paper deals with a new and very efficient way of assessing this generalization performance. We present an analysis which characterizes the expected generalization error of the hypothesis with least training error in terms of the distribution of error rates of the hypotheses in the model. This distribution can be estimated very efficiently from the data which immediately leads to an efficient model selection algorithm. The analysis predicts learning curves with a very high precision and thus contributes to a better understanding of why and when over-fitting occurs. 
We present empirical studies (controlled experiments on Boolean decision trees and a large-scale text categorization problem) which show that the model selection algorithm leads to error rates which are often as low as those obtained by 10-fold cross validation (sometimes even lower). However, the algorithm is much more efficient (because the learner does not have to be invoked at all) and thus solves model selection problems with as many as a thousand relevant attributes and 12000 examples.}, } @inProceedings{Schneider03, author = {{Karl-Michael} Schneider}, year = {2003}, title = {A Comparison of Event Models for Naive Bayes Anti-Spam E-Mail Filtering}, pages = {}, address = {}, editor = {}, booktitle = {Proceedings of EACL-03, 11th Conference of the European Chapter of the Association for Computational Linguistics}, url = {http://www.phil.uni-passau.de/linguistik/mitarbeiter/schneider/pub/eacl2003.pdf}, abstract = {}, } @inProceedings{Schutze95, author = {Hinrich Sch{\"{u}}tze and David A. Hull and Jan O. Pedersen}, title = {A comparison of classifiers and document representations for the routing problem}, booktitle = {Proceedings of SIGIR-95, 18th ACM International Conference on Research and Development in Information Retrieval}, editor = {Edward A. Fox and Peter Ingwersen and Raya Fidel}, publisher = {ACM Press, New York, US}, year = {1995}, address = {Seattle, US}, pages = {229--237}, url = {ftp://parcftp.xerox.com/pub/qca/papers/sigir95.ps.gz}, abstract = {In this paper, we compare learning techniques based on statistical classification to traditional methods of relevance feedback for the document routing problem. We consider three classification techniques which have decision rules that are derived via explicit error minimization: linear discriminant analysis, logistic regression, and neural networks. We demonstrate that the classifiers perform 10-15\% better than relevance feedback via Rocchio expansion for the TREC-2 and TREC-3 routing tasks. 
Error minimization is difficult in high-dimensional feature spaces because the convergence process is slow and the models are prone to overfitting. We use two different strategies, latent semantic indexing and optimal term selection, to reduce the number of features. Our results indicate that features based on latent semantic indexing are more effective for techniques such as linear discriminant analysis and logistic regression, which have no way to protect against overfitting. Neural networks perform equally well with either set of features and can take advantage of the additional information available when both feature sets are used as input.}, } @article{Schutze98, author = {Hinrich Sch{\"{u}}tze}, title = {Automatic word sense discrimination}, journal = {Computational Linguistics}, year = {1998}, volume = {24}, number = {1}, pages = {97--124}, url = {}, abstract = {This paper presents context-group discrimination, a disambiguation algorithm based on clustering. Senses are interpreted as groups (or clusters) of similar contexts of the ambiguous word. Words, contexts and senses are represented in Word Space, a high-dimensional real-valued space in which closeness corresponds to semantic similarity. Similarity in Word Space is based on second-order co-occurrence: two tokens (or contexts) of the ambiguous word are assigned to the same sense cluster if the words they co-occur with in turn occur with similar words in a training corpus. The algorithm is automatic and unsupervised in both training and application: senses are induced from a corpus without labeled training instances or other external knowledge sources. 
The paper demonstrates good performance of context-group discrimination for a sample of natural and artificial ambiguous words.}, } @mastersThesis{Scott98, author = {Sam Scott}, title = {Feature Engineering for a Symbolic Approach to Text Classification}, school = {Computer Science Department, University of Ottawa}, address = {Ottawa, CA}, year = {1998}, url = {http://ai.iit.nrc.ca/II_public/Classification/thesis.pdf}, abstract = {}, } @inProceedings{Scott99, author = {Sam Scott and Stan Matwin}, title = {Feature engineering for text classification}, booktitle = {Proceedings of ICML-99, 16th International Conference on Machine Learning}, editor = {Ivan Bratko and Saso Dzeroski}, year = {1999}, address = {Bled, SL}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, pages = {379--388}, url = {}, abstract = {Most research in text classification to date has used a ``bag of words'' representation in which each feature corresponds to a single word. The paper examines some alternative ways to represent text based on syntactic and semantic relationships between words (phrases, synonyms and hypernyms). We describe the new representations and try to justify our hypothesis that they could improve the performance of a rule based learner. The representations are evaluated using the RIPPER learning algorithm on the Reuters-21578 and DigiTrad test corpora. On their own, the new representations are not found to produce significant performance improvements. We also try combining classifiers based on different representations using a majority voting technique, and this improves performance on both test collections. 
In our opinion, more sophisticated natural language processing techniques need to be developed before better text representations can be produced for classification.}, } @inProceedings{Sebastiani00, author = {Fabrizio Sebastiani and Alessandro Sperduti and Nicola Valdambrini}, title = {An improved boosting algorithm and its application to automated text categorization}, booktitle = {Proceedings of CIKM-00, 9th ACM International Conference on Information and Knowledge Management}, address = {McLean, US}, editor = {Arvin Agah and Jamie Callan and Elke Rundensteiner}, publisher = {ACM Press, New York, US}, year = {2000}, pages = {78--85}, url = {http://www.math.unipd.it/~fabseb60/Publications/CIKM00.pdf}, abstract = {We describe {\sc AdaBoost.MH$^{KR}$}, an improved boosting algorithm, and its application to text categorization. Boosting is a method for supervised learning which has successfully been applied to many different domains, and that has proven one of the best performers in text categorization exercises so far. Boosting is based on the idea of relying on the collective judgment of a committee of classifiers that are trained sequentially. In training the $i$-th classifier special emphasis is placed on the correct categorization of the training documents which have proven harder for the previously trained classifiers. {\sc AdaBoost.MH$^{KR}$} is based on the idea to build, at every iteration of the learning phase, not a single classifier but a sub-committee of the $K$ classifiers which, at that iteration, look the most promising. We report the results of systematic experimentation of this method performed on the standard {\sf Reuters-21578} benchmark. 
These experiments have shown that {\sc AdaBoost.MH$^{KR}$} is both more efficient to train and more effective than the original {\sc AdaBoost.MH$^R$} algorithm.}, } @article{Sebastiani02, author = {Fabrizio Sebastiani}, title = {Machine learning in automated text categorization}, journal = {ACM Computing Surveys}, volume = {34}, number = {1}, pages = {1--47}, year = {2002}, url = {http://www.math.unipd.it/~fabseb60/Publications/ACMCS02.pdf}, abstract = {The automated categorization (or classification) of texts into predefined categories has witnessed a booming interest in the last ten years, due to the increased availability of documents in digital form and the ensuing need to organize them. In the research community the dominant approach to this problem is based on machine learning techniques: a general inductive process automatically builds a classifier by learning, from a set of preclassified documents, the characteristics of the categories. The advantages of this approach over the knowledge engineering approach (consisting in the manual definition of a classifier by domain experts) are a very good effectiveness, considerable savings in terms of expert manpower, and straightforward portability to different domains. This survey discusses the main approaches to text categorization that fall within the machine learning paradigm. 
We will discuss in detail issues pertaining to three different problems, namely document representation, classifier construction, and classifier evaluation.}, } @inCollection{Sebastiani05, author = {Fabrizio Sebastiani}, title = {Text Categorization}, editor = {Alessandro Zanasi}, year = {2005}, booktitle = {Text Mining and its Applications to Intelligence, CRM and Knowledge Management}, pages = {109--129}, publisher = {WIT Press}, address = {Southampton, UK}, url = {http://www.math.unipd.it/~fabseb60//Publications/TM05.pdf}, abstract = {Text categorization (also known as text classification, or topic spotting) is the task of automatically sorting a set of documents into categories from a predefined set. This task has several applications, including automated indexing of scientific articles according to predefined thesauri of technical terms, filing patents into patent directories, selective dissemination of information to information consumers, automated population of hierarchical catalogues of Web resources, spam filtering, identification of document genre, authorship attribution, survey coding, and even automated essay grading. Automated text classification is attractive because it frees organizations from the need of manually organizing document bases, which can be too expensive, or simply infeasible given the time constraints of the application or the number of documents involved. The accuracy of modern text classification systems rivals that of trained human professionals, thanks to a combination of information retrieval (IR) technology and machine learning (ML) technology. 
This chapter will outline the fundamental traits of the technologies involved, of the applications that can feasibly be tackled through text classification, and of the tools and resources that are available to the researcher and developer wishing to take up these technologies for deploying real-world applications.}, } @inCollection{Sebastiani05a, author = {Fabrizio Sebastiani}, title = {Text Categorization}, editor = {Laura C. Rivero and Jorge H. Doorn and Viviana E. Ferraggine}, year = {2005}, booktitle = {The Encyclopedia of Database Technologies and Applications}, publisher = {Idea Group Publishing}, address = {Hershey, {US}}, pages = {683--687}, url = {http://www.math.unipd.it/~fabseb60//Publications/EDTA05.pdf}, } @inCollection{Sebastiani06, author = {Fabrizio Sebastiani}, title = {Classification of text, automatic}, editor = {Keith Brown}, year = {2006}, volume = {14}, booktitle = {The Encyclopedia of Language and Linguistics}, publisher = {Elsevier Science Publishers}, address = {Amsterdam, NL}, pages = {}, url = {http://www.math.unipd.it/~fabseb60//Publications/ELL06.pdf}, edition = {Second}, } @inProceedings{Sebastiani99, author = {Fabrizio Sebastiani}, title = {A Tutorial on Automated Text Categorisation}, booktitle = {Proceedings of ASAI-99, 1st Argentinian Symposium on Artificial Intelligence}, editor = {Analia Amandi and Ricardo Zunino}, year = {1999}, address = {Buenos Aires, AR}, pages = {7--35}, url = {http://www.math.unipd.it/~fabseb60/Publications/ASAI99.pdf}, note = {An extended version appears as~\cite{Sebastiani02}}, abstract = {The automated categorisation (or classification) of texts into topical categories has a long history, dating back at least to 1960. Until the late '80s, the dominant approach to the problem involved knowledge-engineering automatic categorisers, i.e. manually building a set of rules encoding expert knowledge on how to classify documents. 
In the '90s, with the booming production and availability of on-line documents, automated text categorisation has witnessed an increased and renewed interest. A newer paradigm based on machine learning has superseded the previous approach. Within this paradigm, a general inductive process automatically builds a classifier by ``learning'', from a set of previously classified documents, the characteristics of one or more categories; the advantages are a very good effectiveness, a considerable savings in terms of expert manpower, and domain independence. In this tutorial we look at the main approaches that have been taken towards automatic text categorisation within the general machine learning paradigm. Issues of document indexing, classifier construction, and classifier evaluation, will be touched upon.}, } @article{Selamat04, author = {Ali Selamat and Sigeru Omatu}, title = {Web page feature selection and classification using neural networks}, journal = {Information Sciences}, year = {2004}, number = {1}, volume = {158}, pages = {69--88}, url = {http://dx.doi.org/10.1016/j.ins.2003.03.003}, abstract = {Automatic categorization is the only viable method to deal with the scaling problem of the World Wide Web (WWW). In this paper, we propose a news web page classification method (WPCM). The WPCM uses a neural network with inputs obtained by both the principal components and class profile-based features. Each news web page is represented by the term-weighting scheme. As the number of unique words in the collection set is big, the principal component analysis (PCA) has been used to select the most relevant features for the classification. Then the final output of the PCA is combined with the feature vectors from the class-profile which contains the most regular words in each class. We have manually selected the most regular words that exist in each class and weighted them using an entropy weighting scheme. 
The fixed number of regular words from each class will be used as a feature vectors together with the reduced principal components from the PCA. These feature vectors are then used as the input to the neural networks for classification. The experimental evaluation demonstrates that the WPCM method provides acceptable classification accuracy with the sports news datasets.}, } @inProceedings{Sevillano04, author = {Sevillano Dominguez, Xavier and Alias Pujol, Francesc and Socoro Carrie, Joan C.}, title = {{ICA}-based hierarchical text classification for multi-domain text-to-speech synthesis}, booktitle = {Proceedings of ICASSP-04, 29th IEEE International Conference on Acoustics, Speech, and Signal Processing}, editor = {}, publisher = {IEEE Computer Society Press, Los Alamitos, US}, address = {Montreal, CA}, year = {2004}, pages = {697--700}, volume = {5}, url = {}, abstract = {In the framework of multi-domain Text-to-Speech synthesis it is essential to (i) design a hierarchically structured database for allowing several domains in the same speech corpus and (ii) include a text classification module that, at run time, assigns the input sentences to a domain or set of domains from the database. In this paper, we present a hierarchical text classifier based on Independent Component Analysis (ICA), which is capable of (i) organizing the contents of the corpus in a hierarchical manner and (ii) classifying the texts to be synthesized according to the learned structure. The document organization and classification performance of our ICA-based hierarchical classifier are evaluated in several encouraging experiments conducted on a journalistic-style text corpus for speech synthesis in Catalan.}, } @inProceedings{Shanahan03, author = {James G. 
Shanahan and Norbert Roma}, title = {Boosting support vector machines for text classification through parameter-free threshold relaxation}, booktitle = {Proceedings of CIKM-03, 12th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, editor = {}, year = {2003}, address = {New Orleans, US}, pages = {247--254}, url = {http://doi.acm.org/10.1145/956863.956911}, abstract = {Support vector machine (SVM) learning algorithms focus on finding the hyperplane that maximizes the margin (the distance from the separating hyperplane to the nearest examples) since this criterion provides a good upper bound of the generalization error. When applied to text classification, these learning algorithms lead to SVMs with excellent precision but poor recall. Various relaxation approaches have been proposed to counter this problem including: asymmetric SVM learning algorithms (soft SVMs with asymmetric misclassification costs); uneven margin based learning; and thresholding. A review of these approaches is presented here. In addition, in this paper, we describe a new threshold relaxation algorithm. This approach builds on previous thresholding work based upon the beta-gamma algorithm. The proposed thresholding strategy is parameter free, relying on a process of retrofitting and cross validation to set algorithm parameters empirically, whereas our previous approach required the specification of two parameters (beta and gamma). The proposed approach is more efficient, does not require the specification of any parameters, and similarly to the parameter-based approach, boosts the performance of baseline SVMs by at least 20\% for standard information retrieval measures.}, } @inProceedings{Shanks03, author = {Vaughan R. Shanks and Hugh E. 
Williams}, title = {Index construction for linear categorisation}, booktitle = {Proceedings of CIKM-03, 12th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, editor = {}, year = {2003}, address = {New Orleans, US}, pages = {334--341}, url = {http://doi.acm.org/10.1145/956863.956926}, abstract = {Categorisation is a useful method for organising documents into subcollections that can be browsed or searched to more accurately and quickly meet information needs. On the Web, category-based portals such as Yahoo! and DMOZ are extremely popular: DMOZ is maintained by over 56,000 volunteers, is used as the basis of the popular Google directory, and is perhaps used by millions of users each day. Support Vector Machines (SVM) is a machine-learning algorithm which has been shown to be highly effective for automatic text categorisation. However, a problem with iterative training techniques such as SVM is that during their learning or training phase, they require the entire training collection to be held in main-memory; this is infeasible for large training collections such as DMOZ or large news wire feeds. In this paper, we show how inverted indexes can be used for scalable training in categorisation, and propose novel heuristics for a fast, accurate, and memory efficient approach. Our results show that an index can be constructed on a desktop workstation with little effect on categorisation accuracy compared to a memory-based approach. 
We conclude that our techniques permit automatic categorisation using very large training collections, vocabularies, and numbers of categories.}, } @article{Shin01, author = {Christian Shin and David Doermann and Azriel Rosenfeld}, title = {Classification of document pages using structure-based features}, journal = {International Journal on Document Analysis and Recognition}, number = {4}, volume = {3}, pages = {232--247}, year = {2001}, url = {http://link.springer.de/link/service/journals/10032/papers/1003004/10030232.pdf}, abstract = {Searching for documents by their type or genre is a natural way to enhance the effectiveness of document retrieval. The layout of a document contains a significant amount of information that can be used to classify it by type in the absence of domain-specific models. Our approach to classification is based on ``visual similarity'' of layout structure and is implemented by building a supervised classifier, given examples of each class. We use image features such as percentages of text and non-text (graphics, images, tables, and rulings) content regions, column structures, relative point sizes of fonts, density of content area, and statistics of features of connected components which can be derived without class knowledge. In order to obtain class labels for training samples, we conducted a study where subjects ranked document pages with respect to their resemblance to representative page images. Class labels can also be assigned based on known document types, or can be defined by the user. 
We implemented our classification scheme using decision tree classifiers and self-organizing maps.}, } @inProceedings{Siersdorfer04, author = {Stefan Siersdorfer and Sergej Sizov and Gerhard Weikum}, title = {Goal-oriented Methods and Meta Methods for Document Classification and their Parameter Tuning}, booktitle = {Proceedings of CIKM-04, 13th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, address = {Washington, US}, editor = {David A. Evans and Luis Gravano and Otthein Herzog and ChengXiang Zhai and Marc Ronthaler}, year = {2004}, pages = {59--68}, url = {}, abstract = {}, } @inProceedings{Siersdorfer05, author = {Stefan Siersdorfer and Gerhard Weikum}, title = {Using restrictive classification and meta classification for junk elimination}, booktitle = {Proceedings of ECIR-05, 27th European Conference on Information Retrieval}, publisher = {Springer Verlag, Heidelberg, DE}, editor = {David E. Losada and Juan M. Fern{\'{a}}ndez-Luna}, address = {Santiago de Compostela, ES}, year = {2005}, pages = {287--299}, url = {}, abstract = {}, } @inProceedings{Siolas00, author = {Siolas, Georges and d'Alch{\'{e}}-Buc, Florence}, title = {Support Vector Machines based on a semantic kernel for text categorization}, booktitle = {Proceedings of IJCNN-00, 11th International Joint Conference on Neural Networks}, publisher = {IEEE Computer Society Press, Los Alamitos, US}, editor = {Amari, Shun-Ichi and Giles, C. Lee and Gori, Marco and Piuri, Vincenzo}, year = {2000}, address = {Como, IT}, volume = {5}, pages = {205--209}, url = {http://dlib.computer.org/conferen/ijcnn/0619/pdf/06193581.pdf}, abstract = {We propose to solve a text categorization task using a new metric between documents, based on a priori semantic knowledge about words. This metric can be incorporated into the definition of radial basis kernels of Support Vector Machines or directly used in a K-nearest neighbors algorithm. 
Both SVM and KNN are tested and compared on the 20-newsgroups database. Support Vector Machines provide the best accuracy on test data.}, } @article{Skarmeta00, author = {Antonio G{\'{o}}mez Skarmeta and Amine Bensaid and Nadia Tazi}, title = {Data mining for text categorization with semi-supervised agglomerative hierarchical clustering}, journal = {International Journal of Intelligent Systems}, year = {2000}, number = {7}, volume = {15}, pages = {633--646}, url = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=72502965&PLACEBO=IE.pdf}, abstract = {In this paper we study the use of a semi-supervised agglomerative hierarchical clustering (ssAHC) algorithm to text categorization, which consists of assigning text documents to predefined categories. ssAHC is (i) a clustering algorithm that (ii) uses a finite design set of labeled data to (iii) help agglomerative hierarchical clustering (AHC) algorithms partition a finite set of unlabeled data and then (iv) terminates without the capability to label other objects. We first describe the text representation method we use in this work; we then present a feature selection method that is used to reduce the dimensionality of the feature space. Finally, we apply the ssAHC algorithm to the Reuters database of documents and show that its performance is superior to the Bayes classifier and to the Expectation-Maximization algorithm combined with Bayes classifier. 
We showed also that ssAHC helps AHC techniques to improve their performance.}, } @inProceedings{Slattery00, author = {Se{\'{a}}n Slattery and Mark Craven}, title = {Discovering test set regularities in relational domains}, booktitle = {Proceedings of ICML-00, 17th International Conference on Machine Learning}, editor = {Pat Langley}, year = {2000}, address = {Stanford, US}, pages = {895--902}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www.cs.cmu.edu/~sean/papers/icml2000.ps}, abstract = {Machine learning typically involves discovering regularities in a training set, then applying these learned regularities to classify objects in a test set. In this paper we present an approach to discovering additional regularities in the test set, and show that in relational domains such test set regularities can be used to improve classification accuracy beyond that achieved using the training set alone. For example, we have previously shown how FOIL, a relational learner, can learn to classify Web pages by discovering training set regularities in the words occurring on target pages, and on other pages related by hyperlinks. Here we show how the classification accuracy of FOIL on this task can be improved by discovering additional regularities on the test set pages that must be classified. Our approach can be seen as an extension to Kleinberg's Hubs and Authorities algorithm that analyzes hyperlink relations among Web pages. 
We present evidence that this new algorithm leads to better test set precision and recall on three binary Web classification tasks where the test set Web pages are taken from different Web sites than the training set.}, } @inProceedings{Slattery98, author = {Se{\'{a}}n Slattery and Mark Craven}, title = {Combining Statistical and Relational Methods for Learning in Hypertext Domains}, booktitle = {Proceedings of ILP-98, 8th International Conference on Inductive Logic Programming}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1446}, editor = {David Page}, year = {1998}, pages = {38--52}, address = {Madison, US}, url = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/ilp98.ps.gz}, abstract = {We present a new approach to learning hypertext classifiers that combines a statistical text-learning method with a relational rule learner. This approach is well suited to learning in hypertext domains because its statistical component allows it to characterize text in terms of word frequencies, whereas its relational component is able to describe how neighboring documents are related to each other by hyperlinks that connect them. We evaluate our approach by applying it to tasks that involve learning definitions for (i) classes of pages, (ii) particular relations that exist between pairs of pages, and (iii) locating a particular class of information in the internal structure of pages. 
Our experiments demonstrate that this new approach is able to learn more accurate classifiers than either of its constituent methods alone.}, } @inProceedings{Slonim01, author = {Noam Slonim and Naftali Tishby}, title = {The Power of Word Clusters for Text Classification}, booktitle = {Proceedings of ECIR-01, 23rd European Colloquium on Information Retrieval Research}, editor = {}, year = {2001}, address = {Darmstadt, DE}, publisher = {}, pages = {}, url = {http://www.cs.huji.ac.il/labs/learning/Papers/irsg3.eps.gz}, abstract = {The recently introduced Information Bottleneck method provides an information theoretic framework, for extracting features of one variable, that are relevant for the values of another variable. Several previous works already suggested applying this method for document clustering, gene expression data analysis, spectral analysis and more. In this work we present a novel implementation of this method for supervised text classification. Specifically, we apply the information bottleneck method to find word-clusters that preserve the information about document categories and use these clusters as features for classification. Previous work used a similar clustering procedure to show that word-clusters can significantly reduce the feature space dimensionality, with only a minor change in classification accuracy. In this work we reproduce these results and go further to show that when the training sample is small word clusters can yield significant improvement in classification accuracy (up to 18\%) over the performance using the words directly.}, } @inProceedings{Soucy01, author = {Pascal Soucy and Guy W. 
Mineau}, title = {A Simple Feature Selection Method for Text Classification}, booktitle = {Proceedings of IJCAI-01, 17th International Joint Conference on Artificial Intelligence}, editor = {Bernhard Nebel}, address = {Seattle, US}, year = {2001}, pages = {897--902}, url = {}, abstract = {In text classification most techniques use bag-of-words to represent documents. The main problem is to identify what words are best suited to classify the documents in such a way as to discriminate between them. Feature selection techniques are then needed to identify these words. The feature selection method presented in this paper is rather simple and computationally efficient. It combines a well known feature selection criterion, the information gain, and a new algorithm that selects and adds a feature to a bag-of-words if it does not occur too often with the features already in a small set composed of the best features selected so far for their high information gain. In brief, it tries to avoid considering features whose discrimination capability is sufficiently covered by already selected features, reducing in size the set of the features used to characterize the document set. This paper presents this feature selection method and its results, and how we have predetermined some of its parameters through experimentation.}, } @inProceedings{Soucy01a, author = {Pascal Soucy and Guy W. Mineau}, title = {A Simple {KNN} Algorithm for Text Categorization}, booktitle = {Proceedings of ICDM-01, IEEE International Conference on Data Mining}, publisher = {IEEE Computer Society Press, Los Alamitos, US}, editor = {Nick Cercone and Tsau Y. Lin and Xindong Wu}, year = {2001}, address = {San Jose, US}, pages = {647--648}, url = {}, abstract = {}, } @inProceedings{Soucy03, author = {Pascal Soucy and Guy W. 
Mineau},
  title = {Feature Selection Strategies for Text Categorization},
  booktitle = {Proceedings of CSCSI-03, 16th Conference of the Canadian Society for Computational Studies of Intelligence},
  editor = {Yang Xiang and Brahim Chaib-Draa},
  address = {Halifax, CA},
  year = {2003},
  pages = {505--509},
  url = {},
  abstract = {},
}

@inProceedings{Spitz00,
  author = {Larry Spitz and Arman Maghbouleh},
  title = {Text categorization using character shape codes},
  booktitle = {Proceedings of the 7th SPIE Conference on Document Recognition and Retrieval},
  publisher = {SPIE, The International Society for Optical Engineering},
  editor = {Daniel P. Lopresti and Jiangying Zhou},
  year = {2000},
  address = {San Jose, US},
  pages = {174--181},
  url = {},
  abstract = {Text categorization in the form of topic identification is a capability of current interest. The paper is concerned with categorization of electronic document images. Previous work on the categorization of document images has relied on optical character recognition (OCR) to provide the transformation between the image domain and a domain where pattern recognition techniques are more readily applied. Our work uses a different technology to provide this transformation. Character shape coding is a computationally efficient, extraordinarily robust means of providing access to the character content of document images. While this transform is lossy, sufficient salient information is retained to support many applications. Furthermore, the use of shape coding is particularly advantageous over OCR in the processing of page images of poor quality.
The authors found that topic identification performance was maintained or slightly improved using character shape codes derived from images.},
}

@article{Stamatatos00,
  author = {Efstathios Stamatatos and Nikos Fakotakis and George Kokkinakis},
  title = {Automatic text categorization in terms of genre and author},
  journal = {Computational Linguistics},
  pages = {471--495},
  year = {2000},
  number = {4},
  volume = {26},
  url = {},
  abstract = {The two main factors that characterize a text are its content and its style, and both can be used as a means of categorization. In this paper we present an approach to text categorization in terms of genre and author for Modern Greek. In contrast to previous stylometric approaches, we attempt to take full advantage of existing natural language processing (NLP) tools. To this end, we propose a set of style markers including analysis-level measures that represent the way in which the input text has been analyzed and capture useful stylistic information without additional cost. We present a set of small-scale but reasonable experiments in text genre detection, author identification, and author verification tasks and show that the proposed method performs better than the most popular distributional lexical measures, i.e., functions of vocabulary richness and frequencies of occurrence of the most frequent words. All the presented experiments are based on unrestricted text downloaded from the World Wide Web without any manual text preprocessing or text sampling. Various performance issues regarding the training set size and the significance of the proposed style markers are discussed. Our system can be used in any application that requires fast and easily adaptable text categorization in terms of stylistically homogeneous categories.
Moreover, the procedure of defining analysis-level markers can be followed in order to extract useful stylistic information using existing text processing tools.},
}

@inProceedings{Stamatatos00a,
  author = {Efstathios Stamatatos and Nikos Fakotakis and George Kokkinakis},
  title = {Text genre detection using common word frequencies},
  booktitle = {Proceedings of COLING-00, the 18th International Conference on Computational Linguistics},
  year = {2000},
  editor = {},
  pages = {808--814},
  address = {Saarbr{\"{u}}cken, DE},
  url = {http://acl.ldc.upenn.edu/C/C00/C00-2117.pdf},
  abstract = {In this paper we present a method for detecting the text genre quickly and easily following an approach originally proposed in authorship attribution studies which uses as style markers the frequencies of occurrence of the most frequent words in a training corpus (Burrows, 1992). In contrast to this approach we use the frequencies of occurrence of the most frequent words of the entire written language. Using as testing ground a part of the Wall Street Journal corpus, we show that the most frequent words of the British National Corpus, representing the most frequent words of the written English language, are more reliable discriminators of text genre in comparison to the most frequent words of the training corpus. Moreover, the frequencies of occurrence of the most common punctuation marks play an important role in terms of accurate text categorization as well as when dealing with training data of limited size.},
}

@inProceedings{Sun01,
  author = {Aixin Sun and Ee-Peng Lim},
  title = {Hierarchical Text Classification and Evaluation},
  booktitle = {Proceedings of ICDM-01, IEEE International Conference on Data Mining},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  editor = {Nick Cercone and Tsau Y.
Lin and Xindong Wu},
  year = {2001},
  address = {San Jose, US},
  pages = {521--528},
  url = {http://www.cais.ntu.edu.sg:8000/~sunaixin/paper/sun_icdm01.pdf},
  abstract = {Hierarchical Classification refers to assigning of one or more suitable categories from a hierarchical category space to a document. While previous work in hierarchical classification focused on virtual category trees where documents are assigned only to the leaf categories, we propose a top-down level-based classification method that can classify documents to both leaf and internal categories. As the standard performance measures assume independence between categories, they have not considered the documents incorrectly classified into categories that are similar or not far from the correct ones in the category tree. We therefore propose the Category-Similarity Measures and Distance-Based Measures to consider the degree of misclassification in measuring the classification performance. An experiment has been carried out to measure the performance of our proposed hierarchical classification method. The results showed that our method performs well for Reuters text collection when enough training documents are given and the new measures have indeed considered the contributions of misclassified documents.},
}

@article{Sun03,
  author = {Aixin Sun and Ee-Peng Lim and Wee-Keong Ng},
  title = {Performance Measurement Framework for Hierarchical Text Classification},
  journal = {Journal of the American Society for Information Science and Technology},
  year = {2003},
  volume = {54},
  number = {11},
  pages = {1014--1028},
  url = {http://www.cais.ntu.edu.sg/~sunaixin/paper/sun_jasist03.pdf},
  abstract = {Hierarchical text classification or simply hierarchical classification refers to assigning a document to one or more suitable categories from a hierarchical category space. In our literature survey, we have found that the existing hierarchical classification experiments used a variety of measures to evaluate performance.
These performance measures often assume independence between categories and do not consider documents misclassified into categories that are similar or not far from the correct categories in the category tree. In this paper, we therefore propose new performance measures for hierarchical classification. The proposed performance measures consist of category similarity measures and distance based measures that consider the contributions of misclassified documents. Our experiments on hierarchical classification methods based on SVM classifiers and binary Naive Bayes classifiers showed that SVM classifiers perform better than Naive Bayes classifiers on Reuters-21578 collection according to the extended measures. A new classifier-centric measure called blocking measure is also defined to examine the performance of subtree classifiers in a top-down level-based hierarchical classification method.},
}

@inCollection{Sun03a,
  author = {Aixin Sun and Ee-Peng Lim and Wee-Keong Ng},
  title = {Hierarchical Text Classification Methods and Their Specification},
  booktitle = {Cooperative Internet Computing},
  editor = {Alvin T. Chan and Stephen C. Chan and H. V. Leong and Vincent T. Y. Ng},
  year = {2003},
  pages = {236--256},
  publisher = {Kluwer Academic Publishers},
  address = {Dordrecht, NL},
  url = {http://www.cais.ntu.edu.sg/~sunaixin/paper/sun_hcl.pdf},
  abstract = {Hierarchical text classification refers to assigning text documents to the categories in a given category tree based on their content. With large number of categories organized as a tree, hierarchical text classification helps users to find information more quickly and accurately. Nevertheless, hierarchical text classification methods in the past have often been constructed in a proprietary manner. The construction steps often involve human efforts and are not completely automated. In this chapter, we therefore propose a specification language known as HCL (Hierarchical Classification Language).
HCL is designed to describe a hierarchical classification method including the definition of a category tree and training of classifiers associated with the categories. Using HCL, a hierarchical classification method can be materialized easily with the help of a method generator system.},
}

@inProceedings{Sun03b,
  author = {Aixin Sun and Ee-Peng Lim},
  title = {Web unit mining: finding and classifying subgraphs of web pages},
  booktitle = {Proceedings of CIKM-03, 12th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor = {},
  year = {2003},
  address = {New Orleans, US},
  pages = {108--115},
  url = {http://doi.acm.org/10.1145/956863.956885},
  abstract = {In web classification, most researchers assume that the objects to classify are individual web pages from one or more web sites. In practice, the assumption is too restrictive since a web page itself may not always correspond to a concept instance of some semantic concept (or category) given to the classification task. In this paper, we want to relax this assumption and allow a concept instance to be represented by a subgraph of web pages or a set of web pages. We identify several new issues to be addressed when the assumption is removed, and formulate the web unit mining problem. We also propose an iterative web unit mining (iWUM) method that first finds subgraphs of web pages using some knowledge about web site structure. From these web subgraphs, web units are constructed and classified into semantic concepts (or categories) in an iterative manner. Our experiments using the WebKB dataset showed that iWUM improves the overall classification performance and works very well on the more structured parts of a web site.},
}

@inProceedings{Taghva00,
  author = {Taghva, Kazem and Nartker, Thomas A.
and Julie Borsack and Steven Lumos and Allen Condit and Ron Young},
  title = {Evaluating text categorization in the presence of {OCR} errors},
  booktitle = {Proceedings of the 8th SPIE Conference on Document Recognition and Retrieval},
  editor = {Paul B. Kantor and Daniel P. Lopresti and Jiangying Zhou},
  year = {2000},
  address = {San Jose, US},
  pages = {68--74},
  publisher = {SPIE, The International Society for Optical Engineering, Washington, US},
  url = {},
  abstract = {In this paper we describe experiments that investigate the effects of OCR errors on text categorization. In particular, we show that in our environment, OCR errors have no effect on categorization when we use a classifier based on the naive Bayes model. We also observe that dimensionality reduction techniques eliminate a large number of OCR errors and improve categorization results.},
}

@inProceedings{Taira01,
  author = {Hirotoshi Taira and Masahiko Haruno},
  title = {Text Categorization Using Transductive Boosting},
  booktitle = {Proceedings of ECML-01, 12th European Conference on Machine Learning},
  editor = {Luc De Raedt and Peter A. Flach},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Freiburg, DE},
  year = {2001},
  pages = {454--465},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 2167},
  url = {http://link.springer.de/link/service/series/0558/papers/2167/21670454.pdf},
  abstract = {In natural language tasks like text categorization, we usually have an enormous amount of unlabeled data in addition to a small amount of labeled data. We present here a transductive boosting method for text categorization in order to make use of the large amount of unlabeled data efficiently.
Our experiments show that the transductive method outperforms conventional boosting techniques that employ only labeled data.},
}

@inProceedings{Taira99,
  author = {Hirotoshi Taira and Masahiko Haruno},
  title = {Feature selection in {SVM} text categorization},
  booktitle = {Proceedings of AAAI-99, 16th Conference of the American Association for Artificial Intelligence},
  publisher = {AAAI Press, Menlo Park, US},
  editor = {},
  year = {1999},
  address = {Orlando, US},
  pages = {480--486},
  url = {},
  abstract = {Investigates the effect of prior feature selection in support vector machine (SVM) text categorization. The input space was gradually increased by using mutual information (MI) filtering and part-of-speech (POS) filtering, which determine the portion of words that are appropriate for learning from the information-theoretic and the linguistic perspectives, respectively. We tested the two filtering methods on SVMs as well as a decision tree algorithm, C4.5. The SVMs' results common to both filtering are that 1) the optimal number of features differed completely across categories, and 2) the average performance for all categories was best when all of the words were used. In addition, a comparison of the two filtering methods clarified that POS filtering on SVMs consistently outperformed MI filtering, which indicates that SVMs cannot find irrelevant parts of speech.
These results suggest a simple strategy for the SVM text categorization: use a full number of words found through a rough filtering technique like part-of-speech tagging.},
}

@inProceedings{Takamura01,
  author = {Hiroya Takamura and Yuji Matsumoto},
  title = {Feature Space Restructuring for {SVMs} with Application to Text Categorization},
  booktitle = {Proceedings of EMNLP-01, 6th Conference on Empirical Methods in Natural Language Processing},
  year = {2001},
  publisher = {Association for Computational Linguistics, Morristown, US},
  editor = {Lillian Lee and Donna Harman},
  pages = {51--57},
  address = {Pittsburgh, US},
  url = {http://www.cs.cornell.edu/home/llee/emnlp/papers/takamura.pdf},
  abstract = {In this paper, we propose a new method of text categorization based on feature space restructuring for SVMs. In our method, independent components of document vectors are extracted using ICA and concatenated with the original vectors. This restructuring makes it possible for SVMs to focus on the latent semantic space without losing information given by the original feature space. Using this method, we achieved high performance in text categorization both with small number and large numbers of labeled data.},
}

@inProceedings{Tan01,
  author = {Ah-Hwee Tan},
  title = {Predictive Self-Organizing Networks for Text Categorization},
  booktitle = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on Knowledge Discovery and Data Mining},
  editor = {David Cheung and Qing Li and Graham Williams},
  year = {2001},
  publisher = {Springer Verlag, Heidelberg, DE},
  address = {Hong Kong, CN},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 2035},
  pages = {66--77},
  url = {http://link.springer.de/link/service/series/0558/papers/2035/20350066.pdf},
  abstract = {This paper introduces a class of predictive self-organizing neural networks known as Adaptive Resonance Associative Map (ARAM) for classification of free-text documents.
Whereas most statistical approaches to text categorization derive classification knowledge based on training examples alone, ARAM performs supervised learning and integrates user-defined classification knowledge in the form of IF-THEN rules. Through our experiments on the Reuters-21578 news database, we showed that ARAM performed reasonably well in mining categorization knowledge from sparse and high dimensional document feature space. In addition, ARAM predictive accuracy and learning efficiency can be improved by incorporating a set of rules derived from the Reuters category description. The impact of rule insertion is most significant for categories with a small number of relevant documents.},
}

@article{Tan02,
  author = {Chade-Meng Tan and Yuan-Fang Wang and Chan-Do Lee},
  title = {The use of bigrams to enhance text categorization},
  journal = {Information Processing and Management},
  year = {2002},
  volume = {38},
  number = {4},
  pages = {529--546},
  url = {http://www.serve.com/cmtan/Meng/ig_m.pdf},
  abstract = {In this paper, we present an efficient text categorization algorithm that generates bigrams selectively by looking for ones that have an especially good chance of being useful. The algorithm uses the information gain metric, combined with various frequency thresholds. The bigrams, along with unigrams, are then given as features to two different classifiers: Naive Bayes and maximum entropy. The experimental results suggest that the bigrams can substantially raise the quality of feature sets, showing increases in the break-even points and F1 measures. The McNemar test shows that in most categories the increases are very significant.
Upon close examination of the algorithm, we concluded that the algorithm is most successful in correctly classifying more positive documents, but may cause more negative documents to be classified incorrectly.},
}

@inProceedings{Taskar01,
  author = {Benjamin Taskar and Eran Segal and Daphne Koller},
  title = {Probabilistic Classification and Clustering in Relational Data},
  booktitle = {Proceedings of IJCAI-01, 17th International Joint Conference on Artificial Intelligence},
  editor = {Bernhard Nebel},
  address = {Seattle, US},
  year = {2001},
  pages = {870--878},
  url = {http://robotics.stanford.edu/~btaskar/pubs/ijcai01.ps},
  abstract = {Supervised and unsupervised learning methods have traditionally focused on data consisting of independent instances of a single type. However, many real-world domains are best described by relational models in which instances of multiple types are related to each other in complex ways. For example, in a scientific paper domain, papers are related to each other via citation, and are also related to their authors. In this case, the label of one entity (e.g., the topic of the paper) is often correlated with the labels of related entities. We propose a general class of models for classification and clustering in relational domains that capture probabilistic dependencies between related instances. We show how to learn such models efficiently from data. We present empirical results on two real world data sets. Our experiments in a transductive classification setting indicate that accuracy can be significantly improved by modeling relational dependencies. Our algorithm automatically induces a very natural behavior, where our knowledge about one instance helps us classify related ones, which in turn help us classify others.
In an unsupervised setting, our models produced coherent clusters with a very natural interpretation, even for instance types that do not have any attributes.},
}

@inProceedings{Taskar02,
  author = {Ben Taskar and Pieter Abbeel and Daphne Koller},
  title = {Discriminative probabilistic models of relational data},
  booktitle = {Proceedings of UAI-02, 18th Conference on Uncertainty in Artificial Intelligence},
  year = {2002},
  address = {Edmonton, CA},
  pages = {485--492},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  editor = {},
  url = {},
  abstract = {In many supervised learning tasks, the entities to be labeled are related to each other in complex ways and their labels are not independent. For example, in hypertext classification, the labels of linked pages are highly correlated. A standard approach is to classify each entity independently, ignoring the correlations between them. Recently, Probabilistic Relational Models, a relational version of Bayesian networks, were used to define a joint probabilistic model for a collection of related entities. In this paper, we present an alternative framework that builds on (conditional) Markov networks and addresses two limitations of the previous approach. First, undirected models do not impose the acyclicity constraint that hinders representation of many important relational dependencies in directed models. Second, undirected models are well suited for discriminative training, where we optimize the conditional likelihood of the labels given the features, which generally improves classification accuracy. We show how to train these models effectively, and how to use approximate probabilistic inference over the learned model for collective classification of multiple related entities. We provide experimental results on a webpage classification task, showing that accuracy can be significantly improved by modeling relational dependencies.},
}

@article{Tauritz00,
  author = {Daniel R. Tauritz and Joost N. Kok and Ida G.
Sprinkhuizen-Kuyper},
  title = {Adaptive information filtering using evolutionary computation},
  journal = {Information Sciences},
  year = {2000},
  volume = {122},
  number = {2/4},
  pages = {121--140},
  url = {http://www.elsevier.nl/gej-ng/10/23/143/56/27/27/article.pdf},
  abstract = {Information Filtering is concerned with filtering data streams in such a way as to leave only pertinent data (information) to be perused. When the data streams are produced in a changing environment the filtering has to adapt too in order to remain effective. Adaptive Information Filtering (AIF) is concerned with filtering in changing environments. The changes may occur both on the transmission side (the nature of the streams can change), and on the reception side (the interest of a user can change). Weighted trigram analysis is a quick and flexible technique for describing the contents of a document. A novel application of evolutionary computation is its use in Adaptive Information Filtering for optimizing various parameters, notably the weights associated with trigrams. The research described in this paper combines weighted trigram analysis, clustering, and a special two-pool evolutionary algorithm, to create an Adaptive Information Filtering system with such useful properties as domain independence, spelling error insensitivity, adaptability, and optimal use of user feedback while minimizing the amount of user feedback required to function properly. We designed a special evolutionary algorithm with a two-pool strategy for this changing environment.},
}

@inProceedings{Tauritz99,
  author = {Daniel R. Tauritz and Ida G. Sprinkhuizen-Kuyper},
  title = {Adaptive Information Filtering Algorithms},
  booktitle = {Proceedings of IDA-99, 3rd Symposium on Intelligent Data Analysis},
  publisher = {Springer Verlag, Heidelberg, DE},
  note = {Published in the ``Lecture Notes in Computer Science'' series, number 1642},
  editor = {David J. Hand and Joost N. Kok and Michael R.
Berthold},
  address = {Amsterdam, NL},
  year = {1999},
  pages = {513--524},
  url = {http://link.springer.de/link/service/series/0558/papers/1642/16420513.pdf},
  abstract = {Adaptive information filtering is concerned with filtering information streams in changing environments. The changes may occur both on the transmission side (the nature of the streams can change) and on the reception side (the interests of a user can change). The research described in this paper details the progress made in a prototype adaptive information filtering system based on weighted trigram analysis and evolutionary computation. The main improvements of the algorithms employed by the system concern the computation of the distance between weighted trigram vectors and further analysis of the two-pool evolutionary algorithm. We tested our new prototype system on the Reuters-21578 text categorization test collection.},
}

@inProceedings{Teahan00,
  author = {William J. Teahan},
  title = {Text classification and segmentation using minimum cross-entropy},
  booktitle = {Proceedings of RIAO-00, 6th International Conference ``Recherche d'Information Assist{\'{e}}e par Ordinateur''},
  editor = {},
  address = {Paris, FR},
  year = {2000},
  pages = {},
  url = {},
  abstract = {Several methods for classifying and segmenting text are described. These are based on ranking text sequences by their cross-entropy calculated using a fixed order character-based Markov model adapted from the PPM text compression algorithm. Experimental results show that the methods are a significant improvement over previously used methods in a number of areas. For example, text can be classified with a very high degree of accuracy by authorship, language, dialect and genre.
Highly accurate text segmentation is also possible - the accuracy of the PPM-based Chinese word segmenter is close to 99\% on Chinese news text; similarly, a PPM-based method of segmenting text by language achieves an accuracy of over 99\%.},
}

@inProceedings{Teytaud01,
  author = {Teytaud, Olivier and Jalam, Radwan},
  title = {Kernel based text categorization},
  booktitle = {Proceedings of IJCNN-01, 12th International Joint Conference on Neural Networks},
  editor = {},
  address = {Washington, US},
  year = {2001},
  pages = {},
  url = {},
  abstract = {},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
}

@inProceedings{Theeramunkong02,
  author = {Thanaruk Theeramunkong and Verayuth Lertnattee},
  title = {Multi-Dimensional Text Classification},
  booktitle = {Proceedings of COLING-02, the 19th International Conference on Computational Linguistics},
  year = {2002},
  editor = {},
  pages = {},
  address = {Taipei, TW},
  url = {http://acl.ldc.upenn.edu/coling2002/proceedings/data/area-28/co-399.pdf},
  abstract = {This paper proposes a multi-dimensional framework for classifying text documents. In this framework, the concept of multidimensional category model is introduced for representing classes. In contrast with traditional flat and hierarchical category models, the multi-dimensional category model classifies each text document in a collection using multiple predefined sets of categories, where each set corresponds to a dimension. Since a multi-dimensional model can be converted to flat and hierarchical models, three classification strategies are possible, i.e., classifying directly based on the multi-dimensional model and classifying with the equivalent flat or hierarchical models. The efficiency of these three classifications is investigated on two data sets.
Using k-NN, naive Bayes and centroid-based classifiers, the experimental results show that the multi-dimensional-based and hierarchical-based classification performs better than the flat-based classifications.},
}

@inProceedings{Thompson01,
  author = {Paul Thompson},
  title = {Automatic categorization of case law},
  booktitle = {Proceedings of ICAIL-01, 8th International Conference on Artificial Intelligence and Law},
  editor = {},
  year = {2001},
  address = {St.\ Louis, US},
  pages = {70--77},
  publisher = {ACM Press, New York, US},
  url = {http://doi.acm.org/10.1145/383535.383543},
  abstract = {This paper describes a series of automatic text categorization experiments with case law documents. Cases are categorized into 40 broad, high-level categories. These results are compared to an existing operational process using Boolean queries manually constructed by domain experts. In this categorization process recall is considered more important than precision. This paper investigates three algorithms that potentially could automate this categorization process: 1) a nearest neighbor-like algorithm, 2) C4.5rules, a machine learning decision tree algorithm; and 3) Ripper, a machine learning rule induction algorithm. The results obtained by Ripper surpass those of the operational process.},
}

@inProceedings{Tong00,
  author = {Simon Tong and Daphne Koller},
  title = {Support Vector Machine Active Learning with Applications to Text Classification},
  booktitle = {Proceedings of ICML-00, 17th International Conference on Machine Learning},
  editor = {Pat Langley},
  year = {2000},
  address = {Stanford, US},
  pages = {999--1006},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url = {http://www.robotics.stanford.edu/~stong/papers/tong_koller_ml00.ps.gz},
  abstract = {Support vector machines have met with significant success in numerous real-world learning tasks.
However, like most machine learning algorithms, they are generally applied using a randomly selected training set classified in advance. In many settings, we also have the option of using pool-based active learning. Instead of using a randomly selected training set, the learner has access to a pool of unlabeled instances and can request the labels for some number of them. We introduce a new algorithm for performing active learning with support vector machines, i.e., an algorithm for choosing which instances to request next. We provide a theoretical motivation for the algorithm. We present experimental results showing that employing our active learning method can significantly reduce the need for labeled training instances in both the standard inductive and transductive settings.},
  note = {An extended version appears as \cite{Tong01}},
}

@article{Tong01,
  author = {Simon Tong and Daphne Koller},
  title = {Support Vector Machine Active Learning with Applications to Text Classification},
  journal = {Journal of Machine Learning Research},
  volume = {2},
  month = nov,
  pages = {45--66},
  year = {2001},
  url = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/tong01a/tong01a.pdf},
  abstract = {Support vector machines have met with significant success in numerous real-world learning tasks. However, like most machine learning algorithms, they are generally applied using a randomly selected training set classified in advance. In many settings, we also have the option of using pool-based active learning. Instead of using a randomly selected training set, the learner has access to a pool of unlabeled instances and can request the labels for some number of them. We introduce a new algorithm for performing active learning with support vector machines, i.e., an algorithm for choosing which instances to request next. We provide a theoretical motivation for the algorithm using the notion of a version space.
We present experimental results showing that employing our active learning method can significantly reduce the need for labeled training instances in both the standard inductive and transductive settings.},
}

@inProceedings{Tong92,
  author = {Richard Tong and Adam Winkler and Pamela Gage},
  title = {Classification Trees for Document Routing: A Report on the {TREC} Experiment},
  booktitle = {Proceedings of TREC-1, 1st Text Retrieval Conference},
  publisher = {National Institute of Standards and Technology, Gaithersburg, US},
  editor = {Donna K. Harman},
  year = {1992},
  address = {Gaithersburg, US},
  pages = {209--228},
  url = {http://trec.nist.gov/pubs/trec1/papers/17.txt},
  abstract = {Describes an approach to document routing on the TREC corpus that employs a technique for the automatic construction of classification trees. The approach makes use of the Classification and Regression Trees (CART) algorithm that has seen application in various areas of machine learning. The authors' initial work with this algorithm has demonstrated that probabilistic structures can be automatically acquired from a training set of documents with respect to a single target concept, or a set of related concepts. These structures can then be applied to individual documents to derive a posterior probability that the document is about a particular target concept.},
}

@inProceedings{Toutanova01,
  author = {Kristina Toutanova and Francine Chen and Kris Popat and Thomas Hofmann},
  title = {Text Classification in a Hierarchical Mixture Model for Small Training Sets},
  booktitle = {Proceedings of CIKM-01, 10th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor = {Henrique Paques and Ling Liu and David Grossman},
  year = {2001},
  address = {Atlanta, US},
  pages = {105--113},
  url = {http://www.stanford.edu/~krist/papers/cikm2001.pdf},
  abstract = {Documents are commonly categorized into hierarchies of topics, such as the ones maintained by Yahoo!
and the Open Directory project, in order to facilitate browsing and other interactive forms of information retrieval. In addition, topic hierarchies can be utilized to overcome the sparseness problem in text categorization with a large number of categories, which is the main focus of this paper. This paper presents a hierarchical mixture model which extends the standard naive Bayes classifier and previous hierarchical approaches. Improved estimates of the term distributions are made by differentiation of words in the hierarchy according to their level of generality/specificity. Experiments on the Newsgroups and the Reuters-21578 dataset indicate improved performance of the proposed classifier in comparison to other state-of-the-art methods on datasets with a small number of positive examples.}, } @article{Tsay04, author = {Jyh-Jong Tsay and Jing-Doo Wang}, title = {Improving linear classifier for Chinese text categorization}, journal = {Information Processing and Management}, year = {2004}, volume = {40}, number = {2}, pages = {223--237}, url = {}, abstract = {}, } @article{Turney00, author = {Peter D. Turney}, title = {Learning Algorithms for Keyphrase Extraction}, journal = {Information Retrieval}, number = {4}, volume = {2}, pages = {303--336}, year = {2000}, url = {http://extractor.iit.nrc.ca/reports/IR2000.ps.Z}, abstract = {Many academic journals ask their authors to provide a list of about five to fifteen keywords, to appear on the first page of each article. Since these key words are often phrases of two or more words, we prefer to call them keyphrases. There is a wide variety of tasks for which keyphrases are useful, as we discuss in this paper. We approach the problem of automatically extracting keyphrases from text as a supervised learning task. We treat a document as a set of phrases, which the learning algorithm must learn to classify as positive or negative examples of keyphrases. 
Our first set of experiments applies the C4.5 decision tree induction algorithm to this learning task. We evaluate the performance of nine different configurations of C4.5. The second set of experiments applies the GenEx algorithm to the task. We developed the GenEx algorithm specifically for automatically extracting keyphrases from text. The experimental results support the claim that a custom-designed algorithm (GenEx), incorporating specialized procedural domain knowledge, can generate better keyphrases than a general-purpose algorithm (C4.5). Subjective human evaluation of the keyphrases generated by Extractor suggests that about 80\% of the keyphrases are acceptable to human readers. This level of performance should be satisfactory for a wide variety of applications.}, } @inProceedings{Tzeras93, author = {Tzeras, Konstadinos and Hartmann, Stephan}, title = {Automatic indexing based on Bayesian inference networks}, booktitle = {Proceedings of SIGIR-93, 16th ACM International Conference on Research and Development in Information Retrieval}, editor = {Robert Korfhage and Edie Rasmussen and Peter Willett}, publisher = {ACM Press, New York, US}, address = {Pittsburgh, US}, pages = {22--34}, year = {1993}, url = {http://www.darmstadt.gmd.de/~tzeras/FullPapers/gz/Tzeras-Hartmann-93.ps.gz}, abstract = {In this paper, a Bayesian inference network model for automatic indexing with index terms (descriptors) from a prescribed vocabulary is presented. It requires an indexing dictionary with rules mapping terms of the respective subject field onto descriptors and inverted lists for terms occurring in a set of documents of the subject field and descriptors manually assigned to these documents. The indexing dictionary can be derived automatically from a set of manually indexed documents. 
An application of the network model is described, followed by an indexing example and some experimental results about the indexing performance of the network model.}, } @article{Uren02, author = {Victoria S. Uren and Thomas R. Addis}, title = {How weak categorizers based upon different principles strengthen performance}, journal = {The Computer Journal}, year = {2002}, volume = {45}, number = {5}, pages = {511--524}, url = {http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_05/pdf/450511.pdf}, abstract = {Combining the results of classifiers has shown much promise in machine learning generally. However, published work on combining text categorizers suggests that, for this particular application, improvements in performance are hard to attain. Explorative research using a simple voting system is presented and discussed in the light of a probabilistic model that was originally developed for safety critical software. It was found that typical categorization approaches produce predictions which are too similar for combining them to be effective since they tend to fail on the same records. Further experiments using two less orthodox categorizers are also presented which suggest that combining text categorizers can be successful, provided the essential element of 'difference' is considered.}, } @article{Urena01, author = {L. Alfonso Ure{\~{n}}a-L{\'{o}}pez and Manuel Buenaga and Jos{\'{e}} M. G{\'{o}}mez}, title = {Integrating linguistic resources in TC through WSD}, journal = {Computers and the Humanities}, year = {2001}, number = {2}, volume = {35}, pages = {215--230}, url = {http://www.wkap.nl/article.pdf?266250}, abstract = {Information access methods must be improved to overcome the information overload that most professionals face nowadays. Text classification tasks, like text categorization, help the users to access to the great amount of text they find in the Internet and their organizations. 
TC is the classification of documents into a predefined set of categories. Most approaches to automatic TC are based on the utilization of a training collection, which is a set of manually classified documents. Other linguistic resources that are emerging, like lexical databases, can also be used for classification tasks. This article describes an approach to TC based on the integration of a training collection (Reuters-21578) and a lexical database (WORDNET 1.6) as knowledge sources. Lexical databases accumulate information on the lexical items of one or several languages. This information must be filtered in order to make an effective use of it in our model of TC. This filtering process is a word sense disambiguation task. WSD is the identification of the sense of words in context. This task is an intermediate process in many natural language processing tasks like machine translation or multilingual information retrieval. We present the utilization of WSD as an aid for TC. Our approach to WSD is also based on the integration of two linguistic resources: a training collection (SEMCOR and Reuters-21578) and a lexical database (WORDNET 1.6).}, } @inProceedings{Vert01, author = {Jean-Philippe Vert}, title = {Text Categorization Using Adaptive Context Trees}, booktitle = {Proceedings of CICLING-01, 2nd International Conference on Computational Linguistics and Intelligent Text Processing}, year = {2001}, editor = {Alexander Gelbukh}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Mexico City, MX}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2004}, pages = {423--436}, url = {http://link.springer.de/link/service/series/0558/papers/2004/20040423.pdf}, abstract = {A new way of representing texts written in natural language is introduced, as a conditional probability distribution at the letter level learned with a variable length Markov model called adaptive context tree model. 
Text categorization experiments demonstrates the ability of this representation to catch information about the semantic content of the text.}, } @inProceedings{Viechnicki98, author = {Peter Viechnicki}, title = {A Performance Evaluation of Automatic Survey Classifiers}, booktitle = {Proceedings of ICGI-98, 4th International Colloquium on Grammatical Inference}, address = {Ames, US}, editor = {Vasant Honavar and Giora Slutzki}, year = {1998}, pages = {244--256}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1433}, } @inProceedings{Vinciarelli04, author = {Vinciarelli, Alessandro}, title = {Noisy Text Categorization}, booktitle = {Proceedings of ICPR-04, 17th International Conference on Pattern Recognition}, year = {2004}, address = {Cambridge, UK}, abstract = {This work presents a system for the categorization of noisy texts. By noisy it is meant any text obtained through an extraction process (affected by errors) from media different than digital texts. 
We show that, even with an average Word Error Rate of around 50\%, the categorization performance loss with respect to the clean version of the same documents is negligible.}, url = {ftp://ftp.idiap.ch/pub/reports/2003/rr03-61.pdf}, } @inProceedings{Vinokourov01, author = {Alexei Vinokourov and Mark Girolami}, title = {Document Classification Employing the Fisher Kernel Derived from Probabilistic Hierarchic Corpus Representations}, booktitle = {Proceedings of ECIR-01, 23rd European Colloquium on Information Retrieval Research}, editor = {}, year = {2001}, address = {Darmstadt, DE}, publisher = {}, pages = {24--40}, url = {http://cis.paisley.ac.uk/vino-ci0/fisher_hierarchic.ps}, abstract = {This paper demonstrates that the probabilistic corpus model which emerges from the automatic or unsupervised hierarchical organisation of a document collection can be further exploited to create a kernel which boosts the performance of state-of-the-art Support Vector Machine document classifiers. It is demonstrated that the performance of such a classifier is further enhanced when employing the kernel derived from an appropriate hierarchic mixture model used for partitioning a document corpus rather than the kernel associated with a flat non-hierarchic mixture model. This has important implications for document classification when a hierarchic ordering of topics exists. 
This can be considered as the effective combination of documents with no topic or class labels (unlabeled data), labeled documents, and prior domain knowledge (in the form of the known hierarchic structure), in providing enhanced document classification performance.}, } @article{Vinokourov02, author = {Alexei Vinokourov and Mark Girolami}, title = {A Probabilistic Framework for the Hierarchic Organisation and Classification of Document Collections}, journal = {Journal of Intelligent Information Systems}, year = {2002}, note = {Special Issue on Automated Text Categorization}, volume = {18}, number = {2/3}, pages = {153--172}, url = {http://www.wkap.nl/article.pdf?391244}, abstract = {This paper presents a probabilistic mixture modeling framework for the hierarchic organisation of document collections. It is demonstrated that the probabilistic corpus model which emerges from the automatic or unsupervised hierarchical organisation of a document collection can be further exploited to create a kernel which boosts the performance of state-of-the-art Support Vector Machine document classifiers. It is shown that the performance of such a classifier is further enhanced when employing the kernel derived from an appropriate hierarchic mixture model used for partitioning a document corpus rather than the kernel associated with a flat non-hierarchic mixture model. This has important implications for document classification when a hierarchic ordering of topics exists. 
This can be considered as the effective combination of documents with no topic or class labels (unlabeled data), labeled documents, and prior domain knowledge (in the form of the known hierarchic structure), in providing enhanced document classification performance.}, } @inProceedings{Vinot03, author = {Romain Vinot and Fran{\c{c}}ois Yvon}, title = {Improving Rocchio with Weakly Supervised Clustering}, booktitle = {Proceedings of ECML-03, 14th European Conference on Machine Learning}, publisher = {Springer Verlag, Heidelberg, DE}, editor = {}, year = {2003}, address = {Dubrovnik, HR}, pages = {456--467}, url = {}, abstract = {This paper presents a novel approach for adapting the complexity of a text categorization system to the difficulty of the task. In this study, we adapt a simple text classifier (Rocchio), using weakly supervised clustering techniques. The idea is to identify sub-topics of the original classes which can help improve the categorization process. To this end, we propose several clustering algorithms, and report results of various evaluations on standard benchmark corpora such as the Newsgroups corpus.}, } @inProceedings{Wang00, author = {Wang, Wenxian and Meng, Weiyi and Yu, Clement}, title = {Concept hierarchy based text database categorization in a metasearch engine environment}, booktitle = {Proceedings of WISE-00, 1st International Conference on Web Information Systems Engineering}, editor = {Li, Qing and Ozsoyoglu, Z. Meral and Wagner, Roland and Kambayashi, Yahiko and Zhang, Yanchun}, pages = {283--290}, year = {2000}, address = {Hong Kong, CN}, publisher = {IEEE Computer Society Press, Los Alamitos, US}, volume = {1}, url = {http://panda.cs.binghamton.edu/~meng/pub.d/wise00.doc}, abstract = {Document categorization, as a technique to improve the retrieval of useful documents, has been extensively investigated. 
One important issue in a large-scale meta-search engine is to select text databases that are likely to contain useful documents for a given query. We believe that database categorization can be a potentially effective technique for good database selection, especially in the Internet environment, where short queries are usually submitted. In this paper, we propose and evaluate several database categorization algorithms. This study indicates that, while some document categorization algorithms could be adopted for database categorization, algorithms that take into consideration the special characteristics of databases may be more effective. Preliminary experimental results are provided to compare the proposed database categorization algorithms.}, } @inProceedings{Wang01, author = {Ke Wang and Senqiang Zhou and Yu He}, title = {Hierarchical Classification of Real Life Documents}, booktitle = {Proceedings of the 1st SIAM International Conference on Data Mining}, publisher = {}, editor = {}, year = {2001}, address = {Chicago, US}, pages = {}, url = {http://www.cs.sfu.ca/~wangk/pub/sdm2001.ps}, abstract = {}, } @inProceedings{Wang04, author = {Gang Wang and Frederick H. Lochovsky}, title = {Feature Selection with Conditional Mutual Information MaxiMin in Text Categorization}, booktitle = {Proceedings of CIKM-04, 13th ACM International Conference on Information and Knowledge Management}, publisher = {ACM Press, New York, US}, address = {Washington, US}, editor = {David A. Evans and Luis Gravano and Otthein Herzog and ChengXiang Zhai and Marc Ronthaler}, year = {2004}, pages = {342--349}, url = {}, abstract = {}, } @inProceedings{Wang99, author = {Hui Wang and Nguyen H. Son}, title = {Text classification using lattice machine}, booktitle = {Proceedings of ISMIS-99, 11th International Symposium on Methodologies for Intelligent Systems}, editor = {Andrzej Skowron and Zbigniew W. 
Ra{\'{s}}}, pages = {235--243}, year = {1999}, address = {Warsaw, PL}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1609}, url = {}, abstract = {}, } @inProceedings{Wang99a, author = {Ke Wang and Senqiang Zhou and Shiang Chen Liew}, title = {Building hierarchical classifiers using class proximity}, booktitle = {Proceedings of VLDB-99, 25th International Conference on Very Large Data Bases}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, editor = {Malcolm P. Atkinson and Maria E. Orlowska and Patrick Valduriez and Stanley B. Zdonik and Michael L. Brodie}, year = {1999}, address = {Edinburgh, UK}, pages = {363--374}, url = {http://www.comp.nus.edu.sg/~wangk/pub/vldb99.ps}, abstract = {We explore how to organize a text database hierarchically to aid better searching and browsing. We propose to exploit the natural hierarchy of topics, or taxonomy, that many corpora, such as internet directories, digital libraries, and patent databases enjoy. In our system, the user navigates through the query response not as a flat unstructured list, but embedded in the familiar taxonomy, and annotated with document signatures computed dynamically with respect to where the user is located at any time. We show how to update such databases with new documents with high speed and accuracy. We use techniques from statistical pattern recognition to efficiently separate the feature words or discriminants from the noise words at each node of the taxonomy. Using these, we build a multi-level classifier. At each node, this classifier can ignore the large number of noise words in a document. Thus the classifier has a small model size and is very fast. However, owing to the use of context-sensitive features, the classifier is very accurate. 
We report on experiences with the Reuters newswire benchmark, the US Patent database, and web document samples from {{\sc Yahoo!}}.}, } @inProceedings{Wei01, author = {Chih-Ping Wei and Yuan-Xin Dong}, title = {A Mining-based Category Evolution Approach to Managing Online Document Categories}, booktitle = {Proceedings of HICSS-01, 34th Annual Hawaii International Conference on System Sciences}, publisher = {IEEE Computer Society Press, Los Alamitos, US}, editor = {Ralph H. Sprague}, year = {2001}, address = {Maui, US}, pages = {}, url = {http://dlib.computer.org/conferen/hicss/0981/pdf/09817061.pdf}, abstract = {With rapid expansion of the numbers and sizes of text repositories and improvements in global connectivity, the quantity of information available online as free-format text is growing exponentially. Many large organizations create and maintain huge volumes of textual information online, and there is a pressing need for support of efficient and effective information retrieval, filtering, and management. Text categorization, or the assignment of textual documents to one or more pre-defined categories based on their content, is an essential component of efficient management and retrieval of documents. Previously, research has focused predominantly on developing or adopting statistical classification or inductive learning methods for automatically discovering text categorization patterns for a pre-defined set of categories. However, as documents accumulate, such categories may not capture a document's characteristics correctly. In this study, we proposed a mining-based category evolution (MiCE) technique to adjust document categories based on existing categories and their associated documents. Empirical evaluation results indicate that the proposed technique, MiCE, was more effective than the category discovery approach and was insensitive to the quality of original categories.}, } @article{Weigend99, author = {Andreas S. Weigend and Erik D. Wiener and Jan O. 
Pedersen}, title = {Exploiting hierarchy in text categorization}, journal = {Information Retrieval}, number = {3}, volume = {1}, pages = {193--216}, year = {1999}, url = {http://www.stern.nyu.edu/~aweigend/Research/Papers/TextCategorization/hierarchy.ps}, abstract = {With the recent dramatic increase in electronic access to documents, text categorization---the task of assigning topics to a given document---has moved to the center of the information sciences and knowledge management. This article uses the structure that is present in the semantic space of topics in order to improve performance in text categorization: according to their meaning, topics can be grouped together into ``meta-topics'', e.g., gold, silver, and copper are all metals. The proposed architecture matches the hierarchical structure of the topic space, as opposed to a flat model that ignores the structure. It accommodates both single and multiple topic assignments for each document. Its probabilistic interpretation allows its predictions to be combined in a principled way with information from other sources. The first level of the architecture predicts the probabilities of the meta-topic groups. This allows the individual models for each topic on the second level to focus on finer discriminations within the group. Evaluating the performance of a two-level implementation on the Reuters-22173 testbed of newswire articles shows the most significant improvement for rare classes.}, } @article{Weiss99, author = {Sholom M. Weiss and Chidanand Apt{\'{e}} and Fred J. Damerau and David E. Johnson and Frank J. 
Oles and Thilo Goetz and Thomas Hampp}, title = {Maximizing text-mining performance}, journal = {IEEE Intelligent Systems}, year = {1999}, number = {4}, volume = {14}, pages = {63--69}, url = {http://www.research.ibm.com/dar/papers/pdf/ieee99_mtmp.pdf}, abstract = {With the advent of centralized data warehouses, where data might be stored as electronic documents or as text fields in databases, text mining has increased in importance and economic value. One important goal in text mining is automatic classification of electronic documents. Computer programs scan text in a document and apply a model that assigns the document to one or more prespecified topics. Researchers have used benchmark data, such as the Reuters-21578 test collection, to measure advances in automated text categorization. Conventional methods such as decision trees have had competitive, but not optimal, predictive performance. Using the Reuters collection, we show that adaptive resampling techniques can improve decision-tree performance and that relatively small, pooled local dictionaries are effective. We've applied these techniques to online banking applications to enhance automated e-mail routing.}, } @article{Wermter00, author = {Stefan Wermter}, title = {Neural Network Agents for Learning Semantic Text Classification}, journal = {Information Retrieval}, number = {2}, volume = {3}, pages = {87--103}, year = {2000}, url = {http://www.his.sunderland.ac.uk/ps/ir4.pdf}, abstract = {The research project AgNeT develops Agents for Neural Text routing in the internet. Unrestricted potentially faulty text messages arrive at a certain delivery point (e.g. email address or world wide web address). These text messages are scanned and then distributed to one of several expert agents according to a certain task criterium. Possible specific scenarios within this framework include the learning of the routing of publication titles or news titles. 
In this paper we describe extensive experiments for semantic text routing based on classified library titles and newswire titles. This task is challenging since incoming messages may contain constructions which have not been anticipated. Therefore, the contributions of this research are in learning and generalizing neural architectures for the robust interpretation of potentially noisy unrestricted messages. Neural networks were developed and examined for this topic since they support robustness and learning in noisy unrestricted real-world texts. We describe and compare different sets of experiments. The first set of experiments tests a recurrent neural network for the task of library title classification. Then we describe a larger more difficult newswire classification task from information retrieval. The comparison of the examined models demonstrates that techniques from information retrieval integrated into recurrent plausibility networks performed well even under noise and for different corpora.}, } @inProceedings{Wermter02, author = {Stefan Wermter and Chihli Hung}, title = {Selforganizing classification on the Reuters news corpus}, booktitle = {Proceedings of COLING-02, the 19th International Conference on Computational Linguistics}, year = {2002}, editor = {}, pages = {}, address = {Taipei, TW}, url = {http://www.his.sunderland.ac.uk/ps/coling-232.pdf}, abstract = {In this paper we propose an integration of a selforganizing map and semantic networks from WordNet for a text classification task using the new Reuters news corpus. This neural model is based on significance vectors and benefits from the presentation of document clusters. The Hypernym relation in WordNet supplements the neural model in classification. We also analyse the relationships of news headlines and their contents of the new Reuters corpus by a series of experiments. 
This hybrid approach of neural selforganization and symbolic hypernym relationships is successful to achieve good classification rates on 100,000 full-text news articles. These results demonstrate that this approach can scale up to a large real-world task and show a lot of potential for text classification.}, } @inProceedings{Wermter99, author = {Stefan Wermter and Christo Panchev and Garen Arevian}, title = {Hybrid Neural Plausibility Networks for News Agents}, booktitle = {Proceedings of AAAI-99, 16th Conference of the American Association for Artificial Intelligence}, publisher = {AAAI Press, Menlo Park, US}, editor = {}, year = {1999}, pages = {93--98}, address = {Orlando, US}, url = {http://www.his.sunderland.ac.uk/ps/aaai99.pdf}, abstract = {This paper describes a learning news agent HyNeT which uses hybrid neural network techniques for classifying news titles as they appear on an internet newswire. Recurrent plausibility networks with local memory are developed and examined for learning robust text routing. HyNeT is described for the first time in this paper. We show that a careful hybrid integration of techniques from neural network architectures, learning and information retrieval can reach consistent recall and precision rates of more than 92\% on an 82,000 word corpus; this is demonstrated for 10,000 unknown news titles from the Reuters newswire. 
This new synthesis of neural networks, learning and information retrieval techniques allows us to scale up to a real-world task and demonstrates a lot of potential for hybrid plausibility networks for semantic text routing agents on the internet.}, } @inProceedings{Wermter99a, author = {Stefan Wermter and Garen Arevian and Christo Panchev}, title = {Recurrent Neural Network Learning for Text Routing}, booktitle = {Proceedings of ICANN-99, 9th International Conference on Artificial Neural Networks}, publisher = {Institution of Electrical Engineers, London, UK}, editor = {}, year = {1999}, pages = {898--903}, address = {Edinburgh, UK}, url = {http://www.his.sunderland.ac.uk/ps/icann99.pdf}, abstract = {This paper describes new recurrent plausibility networks with internal recurrent hysteresis connections. These recurrent connections in multiple layers encode the sequential context of word sequences. We show how these networks can support text routing of noisy newswire titles according to different given categories. We demonstrate the potential of these networks using an 82,339 word corpus from the Reuters newswire, reaching recall and precision rates above 92\%. In addition, we carefully analyze the internal representation using cluster analysis and output representations using a new surface error technique. In general, based on the current recall and precision performance, as well as the detailed analysis, we show that recurrent plausibility networks hold a lot of potential for developing learning and robust newswire agents for the internet.}, } @inProceedings{Wibowo02, author = {Wahyu Wibowo and Hugh E. 
Williams}, title = {Simple and accurate feature selection for hierarchical categorisation}, booktitle = {Proceedings of the 2002 ACM Symposium on Document engineering}, publisher = {ACM Press, New York, US}, editor = {}, year = {2002}, address = {McLean, US}, pages = {111--118}, url = {http://doi.acm.org/10.1145/585058.585079}, abstract = {Categorisation of digital documents is useful for organisation and retrieval. While document categories can be a set of unstructured category labels, some document categories are hierarchically structured. This paper investigates automatic hierarchical categorisation and, specifically, the role of features in the development of more effective categorisers. We show that a good hierarchical machine learning-based categoriser can be developed using small numbers of features from pre-categorised training documents. Overall, we show that by using a few terms, categorisation accuracy can be improved substantially: unstructured leaf level categorisation can be improved by up to 8.6\%, while top-down hierarchical categorisation accuracy can be improved by up to 12\%. In addition, unlike other feature selection models --- which typically require different feature selection parameters for categories at different hierarchical levels --- our technique works equally well for all categories in a hierarchical structure. We conclude that, in general, more accurate hierarchical categorisation is possible by using our simple feature selection technique.}, } @inProceedings{Wiener95, author = {Erik D. Wiener and Jan O. Pedersen and Andreas S. 
Weigend}, title = {A neural network approach to topic spotting}, booktitle = {Proceedings of SDAIR-95, 4th Annual Symposium on Document Analysis and Information Retrieval}, publisher = {}, editor = {}, year = {1995}, address = {Las Vegas, US}, pages = {317--332}, url = {http://www.stern.nyu.edu/~aweigend/Research/Papers/TextCategorization/Wiener.Pedersen.Weigend_SDAIR95.ps}, abstract = {This paper presents an application of nonlinear neural networks to topic spotting. Neural networks allow us to model higher-order interaction between document terms and to simultaneously predict multiple topics using shared hidden features. In the context of this model, we compare two approaches to dimensionality reduction in representation: one based on term selection and another based on Latent Semantic Indexing (LSI). Two different methods are proposed for improving LSI representations for the topic spotting task. We find that term selection and our modified LSI representations lead to similar topic spotting performance, and that this performance is equal to or better than other published results on the same corpus.}, } @mastersThesis{Wiener95a, author = {Erik D. Wiener}, title = {A neural network approach to topic spotting in text}, school = {Department of Computer Science, University of Colorado at Boulder}, address = {Boulder, US}, year = {1995}, url = {http://www.stern.nyu.edu/~aweigend/Research/Papers/TextCategorization/Wiener_Thesis95.ps}, abstract = {This paper presents an application of nonlinear neural networks to topic spotting. Neural networks allow us to model higher-order interaction between document terms and to simultaneously predict multiple topics using shared hidden features. In the context of this model, we compare two approaches to dimensionality reduction in representation: one based on term selection and another based on Latent Semantic Indexing (LSI). Two different methods are proposed for improving LSI representations for the topic spotting task. 
We find that term selection and our modified LSI representations lead to similar topic spotting performance, and that this performance is equal to or better than other published results on the same corpus.}, } @article{Wong96, author = {Jacqueline W. Wong and Wing-Kay Kan and Gilbert H. Young}, title = {{{\sc Action}}: automatic classification for full-text documents}, journal = {SIGIR Forum}, year = {1996}, volume = {30}, number = {1}, pages = {26--41}, url = {}, abstract = {}, } @article{Wu04, author = {Kuo-Jui Wu and Meng-Chang Chen and Yeali Sun}, title = {Automatic topics discovery from hyperlinked documents}, journal = {Information Processing and Management}, year = {2004}, volume = {40}, number = {2}, pages = {239--255}, url = {}, abstract = {}, } @inProceedings{Wu04a, author = {Xiaoyun Wu and Rohini Srihari and Zhaohui Zheng}, title = {Document representation for one-class SVM}, booktitle = {Proceedings of ECML-04, 15th European Conference on Machine Learning}, editor = {Jean-Fran{\c{c}}ois Boulicaut and Floriana Esposito and Fosca Giannotti and Dino Pedreschi}, address = {Pisa, IT}, pages = {489--500}, year = {2004}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 3201}, url = {}, abstract = {}, } @inProceedings{Xu03, author = {Zhao Xu and Kai Yu and Volker Tresp and Xiaowei Xu and Jizhi Wang}, title = {Representative sampling for text classification using support vector machines}, booktitle = {Proceedings of ECIR-03, 25th European Conference on Information Retrieval}, publisher = {Springer Verlag, Heidelberg, DE}, editor = {Fabrizio Sebastiani}, address = {Pisa, IT}, year = {2003}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2633}, pages = {393--407}, url = {http://link.springer.de/link/service/series/0558/papers/2633/26330393.pdf}, abstract = {In order to reduce human efforts, there has been increasing interest in applying active learning for training text classifiers. 
This paper describes a straightforward active learning heuristic, representative sampling, which explores the clustering structure of 'uncertain' documents and identifies the representative samples to query the user opinions, for the purpose of speeding up the convergence of Support Vector Machine (SVM) classifiers. Compared with other active learning algorithms, the proposed representative sampling explicitly addresses the problem of selecting more than one unlabeled documents. In an empirical study we compared representative sampling both with random sampling and with SVM active learning. The results demonstrated that representative sampling offers excellent learning performance with fewer labeled documents and thus can reduce human efforts in text classification tasks.}, } @inProceedings{Xue03, author = {Dejun Xue and Maosong Sun}, title = {Chinese text categorization based on the binary weighting model with non-binary smoothing}, booktitle = {Proceedings of ECIR-03, 25th European Conference on Information Retrieval}, publisher = {Springer Verlag}, editor = {Fabrizio Sebastiani}, address = {Pisa, IT}, year = {2003}, pages = {408--419}, url = {http://link.springer.de/link/service/series/0558/papers/2633/26330408.pdf}, abstract = {In Text Categorization (TC) based on the vector space model, feature weighting is vital for the categorization effectiveness. Various non-binary weighting schemes are widely used for this purpose. By emphasizing the category discrimination capability of features, the paper firstly puts forward a new weighting scheme TF*IDF*IG. Upon the fact that refined statistics may have more chance to meet sparse data problem, we re-evaluate the role of the Binary Weighting Model (BWM) in TC for further consideration. As a consequence, a novel approach named the Binary Weighting Model with Non-Binary Smoothing (BWM-NBS) is then proposed so as to overcome the drawback of BWM. A TC system for Chinese texts using words as features is implemented. 
Experiments on a large-scale Chinese document collection with 71,674 texts show that the F1 metric of categorization performance of BWM-NBS gets to 94.9\% in the best case, which is 26.4\% higher than that of TF*IDF, 19.1\% higher than that of TF*IDF*IG, and 5.8\% higher than that of BWM under the same condition. Moreover, BWM-NBS exhibits the strong stability in categorization performance.}, } @inProceedings{Xue04, author = {Xue, Dejun and Sun, Maosong}, title = {Eliminating High-Degree Biased Character Bigrams for Dimensionality Reduction in Chinese Text Categorization}, booktitle = {Proceedings of ECIR-04, 26th European Conference on Information Retrieval Research}, editor = {Sharon McDonald and John Tait}, year = {2004}, address = {Sunderland, UK}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2997}, pages = {197--208}, url = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2997&spage=197}, abstract = {High dimensionality of feature space is a main obstacle for Text Categorization (TC). In a candidate feature set consisting of Chinese character bigrams, there exist a number of bigrams which are high-degree biased according to character frequencies. Usually, these bigrams are likely to survive for their strength of discriminating documents after the process of feature selection. However, most of them are useless for document categorization because of the weakness in representing document contents. The paper firstly defines a criterion to identify the high-degree biased Chinese bigrams. Then, two schemes called s-BR1 and s-BR2 are proposed to deal with these bigrams: the former directly eliminates them from the feature set whereas the latter replaces them with the corresponding significant characters involved. 
Experimental results show that the high-degree biased bigrams should be eliminated from the feature set, and the s-BR1 scheme is quite effective for further dimensionality reduction in Chinese text categorization, after a feature selection process with a Chi-CIG score function.}, } @inProceedings{Xue04a, author = {Xue, Dejun and Sun, Maosong}, title = {Raising High-Degree Overlapped Character Bigrams into Trigrams for Dimensionality Reduction in Chinese Text Categorization}, booktitle = {Proceedings of CICLING-04, 5th International Conference on Computational Linguistics and Intelligent Text Processing}, year = {2004}, editor = {Alexander F. Gelbukh}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Seoul, KO}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 2945}, pages = {584--595}, url = {}, abstract = {}, } @inProceedings{Yamazaki97, author = {Takefumi Yamazaki and Ido Dagan}, title = {Mistake-driven Learning with Thesaurus for Text Categorization}, booktitle = {Proceedings of NLPRS-97, the Natural Language Processing Pacific Rim Symposium}, editor = {}, publisher = {}, address = {Phuket, TH}, pages = {369--374}, year = {1997}, url = {ftp://www.links.nectec.or.th/pub/NLPRS/paper/dana4r.ps.gz}, abstract = {This paper extends the mistake-driven learner WINNOW to better utilize thesauri for text categorization. In our method not only words but also semantic categories given by the thesaurus are used as features in a classifier. New filtering and disambiguation methods are used as pre-processing to solve the problems caused by the use of the thesaurus. In order to verify our methods, we test a large body of tagged Japanese newspaper articles created by RWCP. Experimental results show that WINNOW with thesauri attains high accuracy and that the proposed filtering and disambiguation methods also contribute to the improved accuracy.}, } @inProceedings{Yang00, author = {Yiming Yang and Thomas Ault and Thomas Pierce and Charles W. 
Lattimer}, title = {Improving text categorization methods for event tracking}, booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on Research and Development in Information Retrieval}, editor = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong}, publisher = {ACM Press, New York, US}, address = {Athens, GR}, year = {2000}, pages = {65--72}, url = {http://www.cs.cmu.edu/~yiming/papers.yy/sigir00.ps}, abstract = {Automated tracking of events from chronologically ordered document streams is a new challenge for statistical text classification. Existing learning techniques must be adapted or improved in order to effectively handle difficult situations where the number of positive training instances per event is extremely small, the majority of training documents are unlabelled, and most of the events have a short duration in time. We adapted several supervised text categorization methods, specifically several new variants of the k-Nearest Neighbor (kNN) algorithm and a Rocchio approach, to track events. All of these methods showed significant improvement (up to 71\% reduction in weighted error rates) over the performance of the original kNN algorithm on TDT benchmark collections, making kNN among the top-performing systems in the recent TDT3 official evaluation. 
Furthermore, by combining these methods, we significantly reduced the variance in performance of our event tracking system over different data collections, suggesting a robust solution for parameter optimization.}, } @inProceedings{Yang00a, author = {Yiming Yang and Thomas Ault and Thomas Pierce}, title = {Combining multiple learning strategies for effective cross-validation}, booktitle = {Proceedings of ICML-00, 17th International Conference on Machine Learning}, editor = {Pat Langley}, year = {2000}, address = {Stanford, US}, pages = {1167--1182}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www.cs.cmu.edu/~yiming/papers.yy/icml00.ps.gz}, abstract = {Parameter tuning through cross-validation becomes very difficult when the validation set contains no or only a few examples of the classes in the evaluation set. We address this open challenge by using a combination of classifiers with different performance characteristics to effectively reduce the performance variance on average of the overall system across all classes, including those not seen before. This approach allows us to tune the combination system on available but less-representative validation data and obtain smaller performance degradation of this system on the evaluation data than using a single-method classifier alone. We tested this approach by applying k-Nearest Neighbor, Rocchio and Language Modeling classifiers and their combination to the event tracking problem in the Topic Detection and Tracking (TDT) domain, where new classes (events) are created constantly over time, and representative validation sets for new classes are often difficult to obtain on time. 
When parameters tuned on an early benchmark TDT corpus were evaluated on a later TDT benchmark corpus with no overlapping events, we observed a 38-65\% reduction in tracking cost (a weighted combination of errors) by the combined system over the individual methods evaluated under the same conditions, strongly suggesting the robustness of this approach as a solution for improving cross-class performance consistency of statistical classifiers when standard cross-validation fails due to the lack of representative validation sets.}, } @inProceedings{Yang00b, author = {Hsin-Chang Yang and Chung-Hong Lee}, title = {Automatic category generation for text documents by self-organizing maps}, booktitle = {Proceedings of IJCNN-00, 11th International Joint Conference on Neural Networks}, publisher = {IEEE Computer Society Press, Los Alamitos, US}, editor = {Amari, Shun-Ichi and Giles, C. Lee and Gori, Marco and Piuri, Vincenzo}, year = {2000}, address = {Como, IT}, volume = {3}, pages = {581--586}, url = {http://dlib.computer.org/conferen/ijcnn/0619/pdf/06193581.pdf}, abstract = {One important task for text data mining is automatic text categorization, which assigns a text document to some predefined category according to their correlations. Traditionally, these categories as well as the correlations among them are determined by human experts. In this paper, we devised a novel approach to automatically generate categories. The self-organizing map model is used to generate two maps, namely the word cluster map and the document cluster map, in which a neuron represents a cluster of words and documents respectively. Our approach is to analyze the document cluster map to find centroids of some super-clusters. We also devised a method to select the category term from the word cluster map. The hierarchical structure of categories may be generated by recursively applying the same method.
Text categorization is the natural consequence of such automatic category generation process.}, } @inProceedings{Yang00c, author = {Hsin-Chang Yang and Chung-Hong Lee}, title = {Automatic category structure generation and categorization of Chinese text documents}, booktitle = {Proceedings of PKDD-00, 4th European Conference on Principles of Data Mining and Knowledge Discovery}, editor = {Djamel A. Zighed and Jan Komorowski and Jan Zytkow}, publisher = {Springer Verlag, Heidelberg, DE}, note = {Published in the ``Lecture Notes in Computer Science'' series, number 1910}, year = {2000}, address = {Lyon, FR}, pages = {673--678}, url = {http://link.springer.de/link/service/series/0558/papers/1910/19100673.pdf}, abstract = {Recently knowledge discovery and data mining in unstructured or semi-structured texts (text mining) has attracted lots of attention from both commercial and research fields. One aspect of text mining is automatic text categorization, which assigns a text document to some predefined category according to the correlation between the document and the category. Traditionally, the categories are arranged in hierarchical manner to achieve effective searching and indexing, as well as easy comprehension for humans. The determination of categories and their hierarchical structures were most done by human experts. The authors developed an approach to automatically generate categories and reveal the hierarchical structure among them. We also used the generated structure to categorize text documents. The document collection is trained by a self-organizing map to form two feature maps. We then analyzed the two maps to obtain the categories and the structure among them.
Although the corpus contains documents written in Chinese, the proposed approach can be applied to documents written in any language and such documents can be transformed into a list of separated terms.}, } @inProceedings{Yang01, author = {Yiming Yang}, title = {A Study on Thresholding Strategies for Text Categorization}, booktitle = {Proceedings of SIGIR-01, 24th ACM International Conference on Research and Development in Information Retrieval}, editor = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin Zobel}, publisher = {ACM Press, New York, US}, address = {New Orleans, US}, year = {2001}, pages = {137--145}, url = {http://www.cs.cmu.edu/~yiming/papers.yy/sigir01.ps.gz}, abstract = {Thresholding strategies in automated text categorization are an underexplored area of research. This paper presents an examination of the effect of thresholding strategies on the performance of a classifier under various conditions. Using k-Nearest Neighbor (kNN) as the classifier and five evaluation benchmark collections as the testbeds, three common thresholding methods were investigated, including rank-based thresholding (RCut), proportion-based assignments (PCut) and score-based local optimization (SCut); in addition, new variants of these methods are proposed to overcome significant problems in the existing approaches. Experimental results show that the choice of thresholding strategy can significantly influence the performance of kNN, and that the "optimal" strategy may vary by application. SCut is potentially better for fine-tuning but risks overfitting. PCut copes better with rare categories and exhibits a smoother trade-off in recall versus precision, but is not suitable for online decision making. RCut is most natural for online response but is too coarse-grained for global or local optimization.
RTCut, a new method combining the strength of category ranking and scoring, outperforms both PCut and RCut significantly.}, } @article{Yang02, author = {Yiming Yang and Se{\'{a}}n Slattery and Rayid Ghani}, title = {A Study of Approaches to Hypertext Categorization}, journal = {Journal of Intelligent Information Systems}, year = {2002}, note = {Special Issue on Automated Text Categorization}, volume = {18}, number = {2/3}, pages = {219--241}, url = {http://www.wkap.nl/article.pdf?391248}, abstract = {Hypertext poses new research challenges for text classification. Hyperlinks, HTML tags, category labels distributed over linked documents, and meta data extracted from related Web sites all provide rich information for classifying hypertext documents. How to appropriately represent that information and automatically learn statistical patterns for solving hypertext classification problems is an open question. This paper seeks a principled approach to providing the answers. Specifically, we define five {\em hypertext regularities} which may (or may not) hold in a particular application domain, and whose presence (or absence) may significantly influence the optimal design of a classifier. Using three hypertext datasets and three well-known learning algorithms (Naive Bayes, Nearest Neighbor, and First Order Inductive Learner), we examine these regularities in different domains, and compare alternative ways to exploit them. Our results show that the identification of hypertext regularities in the data and the selection of appropriate representations for hypertext in particular domains are crucial, but seldom obvious, in real-world problems. We find that adding the words in the linked neighborhood to the page having those links (both inlinks and outlinks) were helpful for all our classifiers on one data set, but more harmful than helpful for two out of the three classifiers on the remaining datasets. 
We also observed that extracting meta data from related Web sites was extremely useful for improving classification accuracy in some of those domains. Finally, the relative performance of the classifiers being tested provided insights into their strengths and limitations for solving classification problems involving diverse and often noisy Web pages.}, } @inProceedings{Yang03, author = {Yiming Yang and Jian Zhang and Bryan Kisiel}, title = {A scalability analysis of classifiers in text categorization}, booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on Research and Development in Information Retrieval}, editor = {Jamie Callan and Gordon Cormack and Charles Clarke and David Hawking and Alan Smeaton}, publisher = {ACM Press, New York, US}, address = {Toronto, CA}, year = {2003}, pages = {96--103}, url = {http://doi.acm.org/10.1145/860435.860455}, abstract = {Real-world applications of text categorization often require a system to deal with tens of thousands of categories defined over a large taxonomy. This paper addresses the problem with respect to a set of popular algorithms in text categorization, including Support Vector Machines, k-nearest neighbor, ridge regression, linear least square fit and logistic regression. By providing a formal analysis of the computational complexity of each classification method, followed by an investigation on the usage of different classifiers in a hierarchical setting of categorization, we show how the scalability of a method depends on the topology of the hierarchy and the category distributions. In addition, we are able to obtain tight bounds for the complexities by using the power law to approximate category distributions over a hierarchy. Experiments with kNN and SVM classifiers on the OHSUMED corpus are reported on, as concrete examples.}, } @inProceedings{Yang93, author = {Yiming Yang and Christopher G. 
Chute}, title = {An application of {Least Squares Fit} mapping to text information retrieval}, booktitle = {Proceedings of SIGIR-93, 16th ACM International Conference on Research and Development in Information Retrieval}, editor = {Robert Korfhage and Edie Rasmussen and Peter Willett}, publisher = {ACM Press, New York, US}, address = {Pittsburgh, US}, pages = {281--290}, year = {1993}, note = {An extended version appears as~\cite{Yang94}}, url = {http://www.acm.org/pubs/articles/proceedings/ir/160688/p281-yang/p281-yang.pdf}, abstract = {This paper describes a unique example-based mapping method for document retrieval. We discovered that the knowledge about relevance among queries and documents can be used to obtain empirical connections between query terms and the canonical concepts which are used for indexing the content of documents. These connections do not depend on whether there are shared terms among the queries and documents; therefore, they are especially effective for a mapping from queries to the documents where the concepts are relevant but the terms used by article authors happen to be different from the terms of database users. We employ a Linear Least Squares Fit (LLSF) technique to compute such connections from a collection of queries and documents where the relevance is assigned by humans, and then use these connections in the retrieval of documents where the relevance is unknown. We tested this method on both retrieval and indexing with a set of MEDLINE documents which has been used by other information retrieval systems for evaluations. The effectiveness of the LLSF mapping and the significant improvement over alternative approaches was evident in the tests.}, } @article{Yang94, author = {Yiming Yang and Christopher G. 
Chute}, title = {An example-based mapping method for text categorization and retrieval}, journal = {ACM Transactions on Information Systems}, year = {1994}, number = {3}, volume = {12}, pages = {252--277}, url = {http://www.acm.org/pubs/articles/journals/tois/1994-12-3/p252-yang/p252-yang.pdf}, abstract = {A unified model for text categorization and text retrieval is introduced. We use a training set of manually categorized documents to learn word-category associations, and use these associations to predict the categories of arbitrary documents. Similarly, we use a training set of queries and their related documents to obtain empirical associations between query words and indexing terms of documents, and use these associations to predict the related documents of arbitrary queries. A linear least squares fit (LLSF) technique is employed to estimate the likelihood of these associations. Document collections from the MEDLINE database and Mayo patient records are used for studies on the effectiveness of our approach, and on how much the effectiveness depends on the choices of training data, indexing language, word-weighting scheme, and morphological canonicalization. Alternative methods are also tested on these data collections for comparison. It is evident that the LLSF approach uses the relevance information effectively within human decisions of categorization and retrieval, and achieves a semantic mapping of free texts to their representations in an indexing language. Such a semantic mapping leads to a significant improvement in categorization and retrieval, compared to alternative approaches.}, } @inProceedings{Yang94a, author = {Yiming Yang}, title = {Expert network: effective and efficient learning from human decisions in text categorisation and retrieval}, booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on Research and Development in Information Retrieval}, editor = {W. 
Bruce Croft and Van Rijsbergen, Cornelis J.}, publisher = {Springer Verlag, Heidelberg, DE}, address = {Dublin, IE}, pages = {13--22}, year = {1994}, url = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p13-yang/p13-yang.pdf}, abstract = {Expert Network (ExpNet) is our approach to automatic categorization and retrieval of natural language texts. We use a training set of texts with expert assigned categories to construct a network which approximately reflects the conditional probabilities of categories given a text. The input nodes of the network are words in the training texts, the nodes on the intermediate level are the training texts, and the output nodes are categories. The links between nodes are computed based on statistics of the word distribution and the category distribution over the training set. ExpNet is used for relevance ranking of candidate categories of an arbitrary text in the case of text categorization, and for relevance ranking of documents via categories in the case of text retrieval. We have evaluated ExpNet in categorization and retrieval on a document collection of the MEDLINE database, and observed a performance in recall and precision comparable to the Linear Least Squares Fit (LLSF) mapping method, and significantly better than other methods tested. Computationally, ExpNet has an O(N log N) time complexity which is much more efficient than the cubic complexity of the LLSF method. The simplicity of the model, the high recall precision rates, and the efficient computation together make ExpNet preferable as a practical solution for real world applications.}, } @inProceedings{Yang95, author = {Yiming Yang}, title = {Noise reduction in a statistical approach to text categorization}, booktitle = {Proceedings of SIGIR-95, 18th ACM International Conference on Research and Development in Information Retrieval}, editor = {Edward A. 
Fox and Peter Ingwersen and Raya Fidel}, publisher = {ACM Press, New York, US}, year = {1995}, address = {Seattle, US}, pages = {256--263}, url = {http://www.cs.cmu.edu/~yiming/papers.yy/sigir95.ps}, abstract = {The paper studies noise reduction for computational efficiency improvements in a statistical learning method for text categorization, the linear least squares fit (LLSF) mapping. Multiple noise reduction strategies are proposed and evaluated, including: an aggressive removal of ``noninformative words'' from texts before training; the use of a truncated singular value decomposition to cut off noisy ``latent semantic structures'' during training; the elimination of noninfluential components in the LLSF solution (a word concept association matrix) after training. Text collections in different domains were used for evaluation. Significant improvements in computational efficiency without losing categorization accuracy were evident in the testing results.}, } @article{Yang96, author = {Yiming Yang and John W. Wilbur}, title = {Using corpus statistics to remove redundant words in text categorization}, journal = {Journal of the American Society for Information Science}, year = {1996}, volume = {47}, number = {5}, pages = {357--369}, url = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=57757&PLACEBO=IE.pdf}, abstract = {This article studies aggressive word removal in text categorization to reduce the noise in free texts and to enhance the computational efficiency of categorization. We use a novel stop word identification method to automatically generate domain-specific stoplists which are much larger than a conventional domain-independent stoplist. In our tests with three categorization methods on text collections from different domains/applications, significant numbers of words were removed without sacrificing categorization effectiveness. 
In the test of the Expert Network method on CACM documents, for example, an 87\% removal of unique words reduced the vocabulary of documents from 8,002 distinct words to 1,045 words, which resulted in a 63\% time saving and a 74\% memory saving in the computation of category ranking, with a 10\% precision improvement, on average, over not using word removal. It is evident in this study that automated word removal based on corpus statistics has a practical and significant impact on the computational tractability of categorization methods in large databases.}, } @inProceedings{Yang96a, author = {Yiming Yang}, title = {An evaluation of statistical approaches to MEDLINE indexing}, booktitle = {Proceedings of AMIA-96, Fall Symposium of the American Medical Informatics Association}, editor = {James J. Cimino}, publisher = {Hanley and Belfus}, year = {1996}, address = {Washington, US}, pages = {358--362}, url = {http://www.cs.cmu.edu/afs/cs/user/yiming/www/courses/bibliography/papers/scamc96.ps}, abstract = {Whether or not high accuracy classification methods can be scaled to large applications is crucial for the ultimate usefulness of such methods in text categorization. This paper applies two statistical learning algorithms, the Linear Least Squares Fit (LLSF) mapping and a Nearest Neighbor classifier named ExpNet, to a large collection of MEDLINE documents. With the use of suitable dimensionality reduction techniques and efficient algorithms, both LLSF and ExpNet successfully scaled to this very large problem with a result significantly outperforming word-matching and other automatic learning methods applied to the same corpus.}, } @article{Yang96b, author = {Yiming Yang and John W. 
Wilbur}, title = {An analysis of statistical term strength and its use in the indexing and retrieval of molecular biology texts}, journal = {Computers in Biology and Medicine}, year = {1996}, volume = {26}, number = {3}, pages = {209--222}, url = {}, abstract = {The biological literature presents a difficult challenge to information processing in its complexity, diversity, and in its sheer volume. Much of the diversity resides in its technical terminology, which has also become voluminous. In an effort to deal more effectively with this large vocabulary and improve information processing, a method of focus has been developed which allows one to classify terms based on a measure of their importance in describing the content of the documents in which they occur. The measurement is called the strength of a term and is a measure of how strongly the term's occurrences correlate with the subjects of documents in the database. If term occurrences are random then there will be no correlation and the strength will be zero, but if for any subject, the term is either always present or never present its strength will be one. We give here a new, information theoretical interpretation of term strength, review some of its uses in focusing the processing of documents for information retrieval and describe new results obtained in document categorization.}, } @inProceedings{Yang97, author = {Yiming Yang and Jan O. Pedersen}, title = {A comparative study on feature selection in text categorization}, booktitle = {Proceedings of ICML-97, 14th International Conference on Machine Learning}, editor = {Douglas H. Fisher}, year = {1997}, address = {Nashville, US}, pages = {412--420}, publisher = {Morgan Kaufmann Publishers, San Francisco, US}, url = {http://www.cs.cmu.edu/~yiming/papers.yy/ml97.ps}, abstract = {This paper is a comparative study of feature selection methods in statistical learning of text categorization. The focus is on aggressive dimensionality reduction.
Five methods were evaluated, including term selection based on document frequency (DF), information gain (IG), mutual information (MI), a $\chi^2$-test (CHI), and term strength (TS). We found IG and CHI most effective in our experiments. Using IG thresholding with a k-nearest neighbor classifier on the Reuters corpus, removal of up to 98\% removal of unique terms actually yielded an improved classification accuracy (measured by average precision). DF thresholding performed similarly. Indeed we found strong correlations between the DF, IG and CHI values of a term. This suggests that DF thresholding, the simplest method with the lowest cost in computation, can be reliably used instead of IG or CHI when the computation of these measures are too expensive. TS compares favorably with the other methods with up to 50\% vocabulary reduction but is not competitive at higher vocabulary reduction levels. In contrast, MI had relatively poor performance due to its bias towards favoring rare terms, and its sensitivity to probability estimation errors.}, } @inProceedings{Yang99, author = {Yiming Yang and Xin Liu}, title = {A re-examination of text categorization methods}, booktitle = {Proceedings of SIGIR-99, 22nd ACM International Conference on Research and Development in Information Retrieval}, editor = {Marti A. Hearst and Fredric Gey and Richard Tong}, publisher = {ACM Press, New York, US}, address = {Berkeley, US}, year = {1999}, pages = {42--49}, url = {http://www.cs.cmu.edu/~yiming/papers.yy/sigir99.ps}, abstract = {This paper reports a controlled study with statistical significance tests on five text categorization methods: the Support Vector Machines (SVM), a k-Nearest Neighbor (kNN) classifier, a neural network (NNet) approach, the Linear Least-squares Fit (LLSF) mapping and a Naive Bayes (NB) classifier. We focus on the robustness of these methods in dealing with a skewed category distribution, and their performance as function of the training-set category frequency.
Our results show that SVM, kNN and LLSF significantly outperform NNet and NB when the number of positive training instances per category are small (less than ten), and that all the methods perform comparably when the categories are sufficiently common (over 300 instances).}, } @article{Yang99a, author = {Yiming Yang}, title = {An evaluation of statistical approaches to text categorization}, journal = {Information Retrieval}, year = {1999}, pages = {69--90}, volume = {1}, number = {1/2}, url = {http://www.cs.cmu.edu/~yiming/papers.yy/irj99.ps}, abstract = {This paper focuses on a comparative evaluation of a wide-range of text categorization methods, including previously published results on the Reuters corpus and new results of additional experiments. A controlled study using three classifiers, kNN, LLSF and WORD, was conducted to examine the impact of configuration variations in five versions of Reuters on the observed performance of classifiers. Analysis and empirical evidence suggest that the evaluation results on some versions of Reuters were significantly affected by the inclusion of a large portion of unlabelled documents, making those results difficult to interpret and leading to considerable confusions in the literature. Using the results evaluated on the other versions of Reuters which exclude the unlabelled documents, the performance of twelve methods are compared directly or indirectly. For indirect comparisons, kNN, LLSF and WORD were used as baselines, since they were evaluated on all versions of Reuters that exclude the unlabelled documents. As a global observation, kNN, LLSF and a neural network method had the best performance; except for a naive Bayes approach, the other learning algorithms also performed relatively well.}, } @inProceedings{Yavuz98, author = {Yavuz, Tuba and G{\"u}venir, H. 
Altay},
  title     = {Application of k-nearest neighbor on feature projections classifier to text categorization},
  booktitle = {Proceedings of ISCIS-98, 13th International Symposium on Computer and Information Sciences},
  editor    = {U. Gudukbay and T. Dayar and A. Gursoy and Erol Gelenbe},
  publisher = {IOS Press, Amsterdam, NL},
  year      = {1998},
  address   = {Ankara, TR},
  pages     = {135--142},
  url       = {ftp://ftp.cs.bilkent.edu.tr/pub/tech-reports/1998/BU-CEIS-9809.ps.z},
  abstract  = {This paper presents the results of the application of an instance-based learning algorithm k-nearest neighbor method on feature projections (k-NNFP) to text categorization and compares it with k-nearest neighbor classifier (k-NN). K-NNFP is similar to k-NN except it finds the nearest neighbors according to each feature separately. Then it combines these predictions using a majority voting. This property causes k-NNFP to eliminate possible adverse effects of irrelevant features on the classification accuracy. Experimental evidence indicates that k-NNFP is superior to k-NN in terms of classification accuracy in the presence of irrelevant features in many real world domains.},
}

@inProceedings{Yi00,
  author    = {Jeonghee Yi and Neel Sundaresan},
  title     = {A classifier for semi-structured documents},
  booktitle = {Proceedings of KDD-00, 6th ACM International Conference on Knowledge Discovery and Data Mining},
  editor    = {},
  publisher = {ACM Press, New York, US},
  address   = {Boston, US},
  year      = {2000},
  pages     = {340--344},
  url       = {http://doi.acm.org/10.1145/347090.347164},
  abstract  = {In this paper, we describe a novel text classifier that can effectively cope with structured documents. We report experiments that compare its performance with that of a well-known probabilistic classifier. Our novel classifier can take advantage of the information in the structure of document that conventional, purely term-based classifiers ignore.
Conventional classifiers are mostly based on the vector space model of document, which views a document simply as an n-dimensional vector of terms. To retain the information in the structure, we have developed a structured vector model, which represents a document with a structured vector, whose elements can be either terms or other structured vectors. With this extended model, we also have improved the well-known probabilistic classification method based on the Bernoulli document generation model. Our classifier based on these improvements performs significantly better on pre-classified samples from the web and the US Patent database than the usual classifiers.},
}

@inProceedings{Yu03a,
  author    = {Hwanjo Yu and ChengXiang Zhai and Jiawei Han},
  title     = {Text classification from positive and unlabeled documents},
  booktitle = {Proceedings of CIKM-03, 12th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor    = {},
  year      = {2003},
  address   = {New Orleans, US},
  pages     = {232--239},
  url       = {http://doi.acm.org/10.1145/956863.956909},
  abstract  = {Most existing studies of text classification assume that the training data are completely labeled. In reality, however, many information retrieval problems can be more accurately described as learning a binary classifier from a set of incompletely labeled examples, where we typically have a small number of labeled positive examples and a very large number of unlabeled examples. In this paper, we study such a problem of performing Text Classification WithOut labeled Negative data (TC-WON). In this paper, we explore an efficient extension of the standard Support Vector Machine (SVM) approach, called SVMC (Support Vector Mapping Convergence) [17] for the TC-WON tasks.
Our analyses show that when the positive training data is not too under-sampled, SVMC significantly outperforms other methods because SVMC basically exploits the natural ``gap'' between positive and negative documents in the feature space, which eventually corresponds to improving the generalization performance. In the text domain there are likely to exist many gaps in the feature space because a document is usually mapped to a sparse and high dimensional feature space. However, as the number of positive training data decreases, the boundary of SVMC starts overfitting at some point and end up generating very poor results. This is because when the positive training data is too few, the boundary over-iterates and trespasses the natural gaps between positive and negative class in the feature space and thus ends up fitting tightly around the few positive training data.},
}

@inProceedings{Yu98,
  author    = {Kwok L. Yu and Wai Lam},
  title     = {A New On-Line Learning Algorithm for Adaptive Text Filtering},
  booktitle = {Proceedings of CIKM-98, 7th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor    = {Georges Gardarin and James C. French and Niki Pissinou and Kia Makki and Luc Bouganim},
  year      = {1998},
  address   = {Bethesda, US},
  pages     = {156--160},
  url       = {http://www.acm.org/pubs/articles/proceedings/cikm/288627/p156-yu/p156-yu.pdf},
  abstract  = {Much previous work on text filtering is developed for batch filtering. They may not perform effectively in adaptive text filtering which is a more realistic problem. We propose a new on-line learning algorithm, known as the ATF (Adaptive Text Filtering) algorithm, to tackle the adaptive filtering problem. Our approach maintains a pool of selective terms with potentially high predictive power. The documents are retrieved by considering both the predicted relevance and its value as a training observation.
The experimental result on the FBIS document corpus shows that the ATF algorithm outperforms the pure EG (Exponentiated-gradient) algorithm.},
}

@inProceedings{Yu99,
  author    = {Edmund S. Yu and Elizabeth D. Liddy},
  title     = {Feature selection in text categorization using the {Baldwin} effect Networks},
  booktitle = {Proceedings of IJCNN-99, 10th International Joint Conference on Neural Networks},
  editor    = {},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  year      = {1999},
  address   = {Washington, DC},
  pages     = {2924--2927},
  url       = {},
  abstract  = {Text categorization is the problem of automatically assigning predefined categories to natural language texts. A major difficulty of this problem stems from the high dimensionality of its feature space. Reducing the dimensionality, or selecting a good subset of features, without sacrificing accuracy, is of great importance for neural networks to be successfully applied to the area. In this paper, we propose a neuro-genetic approach to feature selection in text categorization. Candidate feature subsets are evaluated by using three-layer feedforward neural networks. The Baldwin effect concerns the tradeoffs between learning and evolution. It is used in our research to guide and improve the GA-based evolution of the feature subsets. Experimental results show that our neuro-genetic algorithm is able to perform as well as, if not better than, the best results of neural networks to date, while using fewer input features.},
}

@inProceedings{Zaiane02,
  author    = {Osmar R.
Za{\"{\i}}ane and Maria-Luiza Antonie},
  title     = {Classifying text documents by associating terms with text categories},
  booktitle = {Proceedings of the 13th Australasian Conference on Database Technologies},
  publisher = {ACM Press, New York, US},
  year      = {2002},
  pages     = {215--222},
  address   = {Melbourne, AU},
  volume    = {5},
  url       = {},
  note      = {This paper has also been published in \emph{Australian Computer Science Communications}, 24(2), 2002.},
  abstract  = {Automatic text categorization has always been an important application and research topic since the inception of digital documents. Today, text categorization is a necessity due to the very large amount of text documents that we have to deal with daily. Many techniques and algorithms for automatic text categorization have been devised and proposed in the literature. However, there is still much room for improving the effectiveness of these classifiers, and new models need to be examined. We propose herein a new approach for automatic text categorization. This paper explores the use of association rule mining in building a text categorization system and proposes a new fast algorithm for building a text classifier. Our approach has the advantage of a very fast training phase, and the rules of the classifier generated are easy to understand and manually tuneable.
Our investigation leads to conclude that association rule mining is a good and promising strategy for efficient automatic text categorization.},
}

@inProceedings{Zelikovitz00,
  author    = {Sarah Zelikovitz and Haym Hirsh},
  title     = {Improving Short Text Classification Using Unlabeled Background Knowledge},
  booktitle = {Proceedings of ICML-00, 17th International Conference on Machine Learning},
  editor    = {Pat Langley},
  year      = {2000},
  address   = {Stanford, US},
  pages     = {1183--1190},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url       = {ftp://ftp.cs.rutgers.edu/pub/zelikovi/bg1.ps},
  abstract  = {We describe a method for improving the classification of short text strings using a combination of labeled training data plus a secondary corpus of unlabeled but related longer documents. We show that such unlabeled background knowledge can greatly decrease error rates, particularly if the number of examples or the size of the strings in the training set is small. This is particularly useful when labeling text is a labor-intensive job and when there is a large amount of information available about a particular problem on the World Wide Web. Our approach views the task as one of information integration using WHIRL, a tool that combines database functionalities with techniques from the information retrieval literature.},
}

@inProceedings{Zelikovitz01,
  author    = {Sarah Zelikovitz and Haym Hirsh},
  title     = {Using {LSI} for Text Classification in the Presence of Background Text},
  booktitle = {Proceedings of CIKM-01, 10th ACM International Conference on Information and Knowledge Management},
  publisher = {ACM Press, New York, US},
  editor    = {Henrique Paques and Ling Liu and David Grossman},
  year      = {2001},
  address   = {Atlanta, US},
  pages     = {113--118},
  url       = {ftp://ftp.cs.rutgers.edu/pub/zelikovi/lsi01.ps},
  abstract  = {This paper presents work that uses Latent Semantic Indexing (LSI) for text classification.
However, in addition to relying on labeled training data, we improve classification accuracy by also using unlabeled data and other forms of available ``background'' text in the classification process. Rather than performing LSI's singular value decomposition (SVD) process solely on the training data, we instead use an expanded term-by-document matrix that includes both the labeled data as well as any available and relevant background text. We report the performance of this approach on data sets both with and without the inclusion of the background text, and compare our work to other efforts that can incorporate unlabeled data and other background text in the classification process.},
}

@inProceedings{Zelikovitz02,
  author    = {Sarah Zelikovitz and Haym Hirsh},
  title     = {Integrating Background Knowledge into Nearest-Neighbor Text Classification},
  pages     = {1--5},
  url       = {},
  booktitle = {Proceedings of ECCBR-02, 6th European Conference on Case-Based Reasoning},
  editor    = {Susan Craw and Alun D. Preece},
  publisher = {Springer Verlag, Heidelberg, DE},
  year      = {2002},
  address   = {Aberdeen, UK},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2416},
}

@inProceedings{Zelikovitz03,
  author    = {Sarah Zelikovitz and Haym Hirsh},
  title     = {Integrating Background Knowledge Into Text Classification},
  pages     = {1448--1449},
  url       = {},
  booktitle = {Proceedings of IJCAI-03, 18th International Joint Conference on Artificial Intelligence},
  editor    = {Georg Gottlob and Toby Walsh},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  year      = {2003},
  address   = {Acapulco, MX},
}

@article{Zhang01,
  author    = {Tong Zhang and Frank J.
Oles},
  title     = {Text Categorization Based on Regularized Linear Classification Methods},
  journal   = {Information Retrieval},
  number    = {1},
  volume    = {4},
  pages     = {5--31},
  year      = {2001},
  url       = {http://www.wkap.nl/article.pdf?335913},
  abstract  = {A number of linear classification methods such as the linear least squares fit (LLSF), logistic regression, and support vector machines (SVM's) have been applied to text categorization problems. These methods share the similarity by finding hyperplanes that approximately separate a class of document vectors from its complement. However, support vector machines are so far considered special in that they have been demonstrated to achieve the state of the art performance. It is therefore worthwhile to understand whether such good performance is unique to the SVM design, or if it can also be achieved by other linear classification methods. In this paper, we compare a number of known linear classification methods as well as some variants in the framework of regularized linear systems. We will discuss the statistical and numerical properties of these algorithms, with a focus on text categorization. We will also provide some numerical experiments to illustrate these algorithms on a number of datasets.},
}

@inProceedings{Zhang03,
  author    = {Dell Zhang and Wee Sun Lee},
  title     = {Question Classification using Support Vector Machines},
  booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Jamie Callan and Gordon Cormack and Charles Clarke and David Hawking and Alan Smeaton},
  publisher = {ACM Press, New York, US},
  address   = {Toronto, CA},
  year      = {2003},
  pages     = {26--32},
  url       = {http://doi.acm.org/10.1145/860435.860443},
  abstract  = {Question classification is very important for question answering. This paper presents our research work on automatic question classification through machine learning approaches.
We have experimented with five machine learning algorithms: Nearest Neighbors (NN), Naive Bayes (NB), Decision Tree (DT), Sparse Network of Winnows (SNoW), and Support Vector Machines (SVM) using two kinds of features: bag-of-words and bag-of-ngrams. The experiment results show that with only surface text features the SVM outperforms the other four methods for this task. Further, we propose to use a special kernel function called the tree kernel to enable the SVM to take advantage of the syntactic structures of questions. We describe how the tree kernel can be computed efficiently by dynamic programming. The performance of our approach is promising, when tested on the questions from the TREC QA track.},
}

@inProceedings{Zhang03a,
  author    = {Jian Zhang and Rong Jin and Yiming Yang and Alex Hauptmann},
  title     = {Modified Logistic Regression: An Approximation to {SVM} and Its Applications in Large-Scale Text Categorization},
  booktitle = {Proceedings of ICML-03, 20th International Conference on Machine Learning},
  editor    = {},
  year      = {2003},
  address   = {Washington, DC},
  pages     = {},
  publisher = {Morgan Kaufmann Publishers, San Francisco, US},
  url       = {},
  abstract  = {},
}

@inProceedings{Zhang03b,
  author    = {Jian Zhang and Yiming Yang},
  title     = {Robustness of regularized linear classification methods in text categorization},
  booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on Research and Development in Information Retrieval},
  editor    = {Jamie Callan and Gordon Cormack and Charles Clarke and David Hawking and Alan Smeaton},
  publisher = {ACM Press, New York, US},
  address   = {Toronto, CA},
  year      = {2003},
  pages     = {190--197},
  url       = {http://doi.acm.org/10.1145/860435.860471},
  abstract  = {Real-world applications often require the classification of documents under situations of small number of features, mis-labeled documents and rare positive examples.
This paper investigates the robustness of three regularized linear classification methods (SVM, ridge regression and logistic regression) under above situations. We compare these methods in terms of their loss functions and score distributions, and establish the connection between their optimization problems and generalization error bounds. Several sets of controlled experiments on the Reuters-21578 corpus are conducted to investigate the robustness of these methods. Our results show that ridge regression seems to be the most promising candidate for rare class problems.},
}

@inProceedings{Zhdanova02,
  author    = {Anna V. Zhdanova and Denis V. Shishkin},
  title     = {Classification of Email Queries by Topic: Approach Based on Hierarchically Structured Subject Domain},
  booktitle = {Proceedings of IDEAL-02, 3rd International Conference on Intelligent Data Engineering and Automated Learning},
  editor    = {Hujun Yin and Nigel Allinson and Richard Freeman and John Keane and Simon Hubbard},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Manchester, UK},
  year      = {2002},
  pages     = {99--104},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2412},
  url       = {http://link.springer.de/link/service/series/0558/papers/2412/24120099.pdf},
  abstract  = {We describe a Classifier of email queries, which executes text categorization by topic. The specifics of our Classifier is that it allows accurate categorization of short messages containing only a few words. This advantage is achieved by executing morphological and semantic analyses of an incoming text. Specifically, the Classifier provides an efficient information extraction and takes the meaning of words into consideration.
By using the hierarchically structured subject domain and classification rules, the Classifier's engine assigns an email query to the most relevant category or categories.},
}

@article{Zheng04,
  author    = {Zhaohui Zheng and Xiaoyun Wu and Rohini Srihari},
  title     = {Feature selection for text categorization on imbalanced data},
  journal   = {SIGKDD Explorations},
  year      = {2004},
  number    = {1},
  volume    = {6},
  pages     = {80--89},
  url       = {http://doi.acm.org/10.1145/1007730.1007741},
  abstract  = {A number of feature selection metrics have been explored in text categorization, among which information gain (IG), chi-square (CHI), correlation coefficient (CC) and odds ratios (OR) are considered most effective. CC and OR are one-sided metrics while IG and CHI are two-sided. Feature selection using one-sided metrics selects the features most indicative of membership only, while feature selection using two-sided metrics implicitly combines the features most indicative of membership (e.g. positive features) and non-membership (e.g. negative features) by ignoring the signs of features. The former never consider the negative features, which are quite valuable, while the latter cannot ensure the optimal combination of the two kinds of features especially on imbalanced data. In this work, we investigate the usefulness of explicit control of that combination within a proposed feature selection framework.
Using multinomial naive Bayes and regularized logistic regression as classifiers, our experiments show both great potential and actual merits of explicitly combining positive and negative features in a nearly optimal fashion according to the imbalanced data.},
}

@inProceedings{Zhou00,
  author    = {Shuigeng Zhou and Ye Fan and Jiangtao Hua and Fang Yu and Yunfa Hu},
  title     = {Hierachically Classifying {Chinese} Web Documents without Dictionary Support and Segmentation Procedure},
  booktitle = {Proceedings of WAIM-00, 1st International Conference on Web-Age Information Management},
  publisher = {Springer Verlag, Heidelberg, DE},
  editor    = {Hongjun Lu and Aoying Zhou},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 1846},
  year      = {2000},
  address   = {Shanghai, CN},
  pages     = {215--226},
  url       = {http://link.springer.de/link/service/series/0558/papers/1846/18460215.pdf},
  abstract  = {This paper reports a system that hierarchically classifies Chinese web documents without dictionary support and segmentation procedure. In our classifier, Web documents are represented by N-grams (N$\leq 4$) that are easy to be extracted. A boosting machine learning approach is applied to classifying Web Chinese documents that share a topic hierarchy. The open and modularized system architecture makes our classifier be extendible.
Experimental results show that our system can effectively and efficiently classify Chinese Web documents.},
}

@inProceedings{Zhou02,
  author    = {Shuigeng Zhou and Jihong Guan},
  title     = {An Approach to Improve Text Classification Efficiency},
  booktitle = {Proceedings of ADBIS-02, 6th East-European Conference on Advances in Databases and Information Systems},
  publisher = {Springer Verlag, Heidelberg, DE},
  editor    = {Yannis Manolopoulos and Pavol N{\'a}vrat},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2435},
  year      = {2002},
  address   = {Bratislava, SK},
  pages     = {65--79},
  url       = {http://link.springer.de/link/service/series/0558/papers/2435/24350065.pdf},
  abstract  = {Text classification is becoming more and more important with the rapid growth of on-line information available. In this paper, we propose an approach to speedup the process of text classification based on pruning the training corpus. Effective algorithm for text corpus pruning is designed. Experiments over real-world text corpus are carried out, which validates the effectiveness and efficiency of the proposed approach. Our approach is especially suitable for applications of on-line text classification.},
}

@inProceedings{Zhou02a,
  author    = {Shuigeng Zhou and Jihong Guan},
  title     = {{Chinese} Documents Classification Based on N-Grams},
  booktitle = {Proceedings of CICLING-02, 3rd International Conference on Computational Linguistics and Intelligent Text Processing},
  publisher = {Springer Verlag, Heidelberg, DE},
  editor    = {Alexander F. Gelbukh},
  note      = {Published in the ``Lecture Notes in Computer Science'' series, number 2276},
  year      = {2002},
  address   = {Mexico City, MX},
  pages     = {405--414},
  url       = {http://link.springer.de/link/service/series/0558/papers/2276/22760405.pdf},
  abstract  = {Traditional Chinese documents classifiers are based on keywords in the documents, which need dictionaries support and efficient segmentation procedures.
This paper explores the techniques of utilizing N-gram information to categorize Chinese documents so that the classifier can shake off the burden of large dictionaries and complex segmentation processing, and subsequently be domain and time independent. A Chinese documents classification system following above described techniques is implemented with Naive Bayes, kNN and hierarchical classification methods. Experimental results show that our system can achieve satisfactory performance, which is comparable with other traditional classifiers.},
}

@inProceedings{Zhou03,
  author    = {Shuigeng Zhou and Tok Wang Ling and Jihong Guan and Jiangtao Hu and Aoying Zhou},
  title     = {Fast Text Classification: A Training-Corpus Pruning Based Approach},
  booktitle = {Proceedings of DASFAA-03, 8th IEEE International Conference on Database Advanced Systems for Advanced Application},
  editor    = {},
  publisher = {IEEE Computer Society Press, Los Alamitos, US},
  year      = {2003},
  address   = {Kyoto, JP},
  pages     = {127--136},
  url       = {},
  abstract  = {},
}

@inProceedings{Zu03,
  author    = {Guowei Zu and Wataru Ohyama and Tetsushi Wakabayashi and Fumitaka Kimura},
  title     = {Accuracy improvement of automatic text classification based on feature transformation},
  booktitle = {Proceedings of DOCENG-03, ACM Symposium on Document engineering},
  publisher = {ACM Press, New York, US},
  editor    = {},
  year      = {2003},
  address   = {Grenoble, FR},
  pages     = {118--120},
  url       = {http://doi.acm.org/10.1145/958220.958242},
  abstract  = {In this paper, we describe a comparative study on techniques of feature transformation and classification to improve the accuracy of automatic text classification.
The normalization to the relative word frequency, the principal component analysis (K-L transformation) and the power transformation were applied to the feature vectors, which were classified by the Euclidean distance, the linear discriminant function, the projection distance, the modified projection distance and the SVM.},
}

@inProceedings{Yan:2005:OCFS,
  author    = "Jun Yan and Ning Liu and Benyu Zhang and Shuicheng Yan and Zheng Chen and Qiansheng Cheng and Weiguo Fan and Wei-Ying Ma",
  title     = "{OCFS}: optimal orthogonal centroid feature selection for text categorization",
  pages     = "122--129",
  booktitle = "Proceedings of the 28th annual international {ACM} {SIGIR} conference on research and development in information retrieval",
  year      = "2005",
  month     = aug,
  address   = "Salvador, Brazil",
  abstract  = "Text categorization is an important research area in many Information Retrieval (IR) applications. To save the storage space and computation time in text categorization, efficient and effective algorithms for reducing the data before analysis are highly desired. Traditional techniques for this purpose can generally be classified into feature extraction and feature selection. Because of efficiency, the latter is more suitable for text data such as web documents. However, many popular feature selection techniques such as Information Gain (IG) and $\chi^2$-test (CHI) are all greedy in nature and thus may not be optimal according to some criterion. Moreover, the performance of these greedy methods may be deteriorated when the reserved data dimension is extremely low. In this paper, we propose an efficient optimal feature selection algorithm by optimizing the objective function of Orthogonal Centroid (OC) subspace learning algorithm in a discrete solution space, called Orthogonal Centroid Feature Selection (OCFS).
Experiments on 20 Newsgroups (20NG), Reuters Corpus Volume 1 (RCV1) and Open Directory Project (ODP) data show that OCFS is consistently better than IG and CHI with smaller computation time especially when the reduced dimension is extremely small."
}

@inProceedings{Seki:2005:ATC,
  author    = "Kazuhiro Seki and Javed Mostafa",
  title     = "An application of text categorization methods to gene ontology annotation",
  pages     = "138--145",
  booktitle = "Proceedings of the 28th annual international {ACM} {SIGIR} conference on research and development in information retrieval",
  year      = "2005",
  month     = aug,
  address   = "Salvador, Brazil",
  abstract  = "This paper describes an application of IR and text categorization methods to a highly practical problem in biomedicine, specifically, Gene Ontology (GO) annotation. GO annotation is a major activity in most model organism database projects and annotates gene functions using a controlled vocabulary. As a first step toward automatic GO annotation, we aim to assign GO domain codes given a specific gene and an article in which the gene appears, which is one of the task challenges at the TREC 2004 Genomics Track. We approached the task with careful consideration of the specialized terminology and paid special attention to dealing with various forms of gene synonyms, so as to exhaustively locate the occurrences of the target gene. We extracted the words around the gene occurrences and used them to represent the gene for GO domain code annotation. As a classifier, we adopted a variant of k-Nearest Neighbor (kNN) with supervised term weighting schemes to improve the performance, making our method among the top-performing systems in the TREC official evaluation. Moreover, it is demonstrated that our proposed framework is successfully applied to another task of the Genomics Track, showing comparable results to the best performing system."
}

@inProceedings{Yang:2005:RAF,
  author    = "Yiming Yang and Shinjae Yoo and Jian Zhang and Bryan Kisiel",
  title     = "Robustness of adaptive filtering methods in a cross-benchmark evaluation",
  pages     = "98--105",
  booktitle = "Proceedings of the 28th annual international {ACM} {SIGIR} conference on research and development in information retrieval",
  year      = "2005",
  month     = aug,
  address   = "Salvador, Brazil",
  abstract  = "This paper reports a cross-benchmark evaluation of regularized logistic regression (LR) and incremental Rocchio for adaptive filtering. Using four corpora from the Topic Detection and Tracking (TDT) forum and the Text Retrieval Conferences (TREC) we evaluated these methods with non-stationary topics at various granularity levels, and measured performance with different utility settings. We found that LR performs strongly and robustly in optimizing T11SU (a TREC utility function) while Rocchio is better for optimizing Ctrk (the TDT tracking cost), a high-recall oriented objective function. Using systematic cross-corpus parameter optimization with both methods, we obtained the best results ever reported on TDT5, TREC10 and TREC11. Relevance feedback on a small portion (0.05--0.2\%) of the TDT5 test documents yielded significant performance improvements, measuring up to a 54\% reduction in Ctrk and a 20.9\% increase in T11SU (with b=0.1), compared to the results of the top-performing system in TDT2004 without relevance feedback information."
}

@inProceedings{Li:2005:PMR,
  author    = "Zhiwei Li and Bin Wang and Mingjing Li and Wei-Ying Ma",
  title     = "A probabilistic model for retrospective news event detection",
  pages     = "106--113",
  booktitle = "Proceedings of the 28th annual international {ACM} {SIGIR} conference on research and development in information retrieval",
  year      = "2005",
  month     = aug,
  address   = "Salvador, Brazil",
  abstract  = "Retrospective news event detection (RED) is defined as the discovery of previously unidentified events in historical news corpus. Although both the contents and time information of news articles are helpful to RED, most researches focus on the utilization of the contents of news articles. Few research works have been carried out on finding better usages of time information. In this paper, we do some explorations on both directions based on the following two characteristics of news articles. On the one hand, news articles are always aroused by events; on the other hand, similar articles reporting the same event often redundantly appear on many news sources. The former hints a generative model of news articles, and the latter provides data enriched environments to perform RED. With consideration of these characteristics, we propose a probabilistic model to incorporate both content and time information in a unified framework. This model gives new representations of both news articles and news events. Furthermore, based on this approach, we build an interactive RED system, HISCOVERY, which provides additional functions to present events, Photo Story and Chronicle."
}

@inProceedings{Zhang:2005:TCK,
  author    = "Dell Zhang and Xi Chen and Lee, Wee Sun",
  title     = "Text classification with kernels on the multinomial manifold",
  pages     = "266--273",
  booktitle = "Proceedings of the 28th annual international {ACM} {SIGIR} conference on research and development in information retrieval",
  year      = "2005",
  month     = aug,
  address   = "Salvador, Brazil",
  abstract  = "Support Vector Machines (SVMs) have been very successful in text classification. However, the intrinsic geometric structure of text data has been ignored by standard kernels commonly used in SVMs. It is natural to assume that the documents are on the multinomial manifold, which is the simplex of multinomial models furnished with the Riemannian structure induced by the Fisher information metric. We prove that the Negative Geodesic Distance (NGD) on the multinomial manifold is conditionally positive definite (cpd), thus can be used as a kernel in SVMs. Experiments show the NGD kernel on the multinomial manifold to be effective for text classification, significantly outperforming standard kernels on the ambient Euclidean space."
}

@inProceedings{Zhu:2005:MLC,
  author    = "Shenghuo Zhu and Xiang Ji and Wei Xu and Yihong Gong",
  title     = "Multi-labelled classification using maximum entropy method",
  pages     = "274--281",
  booktitle = "Proceedings of the 28th annual international {ACM} {SIGIR} conference on research and development in information retrieval",
  year      = "2005",
  month     = aug,
  address   = "Salvador, Brazil",
  abstract  = "Many classification problems require classifiers to assign each single document into more than one category, which is called multi-labelled classification. The categories in such problems usually are neither conditionally independent from each other nor mutually exclusive, therefore it is not trivial to directly employ state-of-the-art classification algorithms without losing information of relation among categories.
In this paper, we explore correlations among categories with maximum entropy method and derive a classification algorithm for multi-labelled documents. Our experiments show that this method significantly outperforms the combination of single label approach." } @InProceedings{Gabrilovich:2005:ODP, author = "Gabrilovich, Evgeniy and Markovitch, Shaul", title = "Feature Generation for Text Categorization Using World Knowledge", pages = "1048--1053", booktitle = "Proceedings of the 19th International Joint Conference on Artificial Intelligence", year = "2005", month = "August", address = "Edinburgh, Scotand", url = "http://www.cs.technion.ac.il/~gabr/papers/fg-tc-ijcai05.pdf", abstract = "We enhance machine learning algorithms for text categorization with generated features based on domain-specific and common-sense knowledge. This knowledge is represented using publicly available ontologies that contain hundreds of thousands of concepts, such as the Open Directory; these ontologies are further enriched by several orders of magnitude through controlled Web crawling. Prior to text categorization, a feature generator analyzes the documents and maps them onto appropriate ontology concepts, which in turn induce a set of generated features that augment the standard bag of words. Feature generation is accomplished through contextual analysis of document text, implicitly performing word sense disambiguation. Coupled with the ability to generalize concepts using the ontology, this approach addresses the two main problems of natural language processing—synonymy and polysemy. Categorizing documents with the aid of knowledge-based features leverages information that cannot be deduced from the documents alone. Experimental results confirm improved performance, breaking through the plateau previously reached in the field." 
}

@inproceedings{Soucy:2005:BTW,
  author    = {Pascal Soucy and Guy Mineau},
  title     = {Beyond {TFIDF} Weighting for Text Categorization in the Vector Space Model},
  booktitle = {Proceedings of the 19th International Joint Conference on Artificial Intelligence},
  year      = {2005},
  month     = {August},
  address   = {Edinburgh, Scotland},
  pages     = {1130--1135},
  url       = {http://www.ijcai.org/papers/0304.pdf},
  abstract  = {KNN and SVM are two machine learning approaches to Text Categorization (TC) based on the Vector Space Model. In this model, borrowed from Information Retrieval, documents are represented as a vector where each component is associated with a particular word from the vocabulary. Traditionally, each component value is assigned using the information retrieval TFIDF measure. While this weighting method seems very appropriate for IR, it is not clear that it is the best choice for TC problems. Actually, this weighting method does not leverage the information implicitly contained in the categorization task to represent documents. In this paper, we introduce a new weighting method based on statistical estimation of the importance of a word for a specific categorization problem. This method also has the benefit to make feature selection implicit, since useless features for the categorization problem considered get a very small weight. Extensive experiments reported in the paper shows that this new weighting method improves significantly the classification accuracy as measured on many categorization tasks.}
}

@inproceedings{Raghavan:2005:IFS,
  author    = {Hema Raghavan and Omid Madani and Rosie Jones},
  title     = {{InterActive} Feature Selection},
  booktitle = {Proceedings of the 19th International Joint Conference on Artificial Intelligence},
  year      = {2005},
  month     = {August},
  address   = {Edinburgh, Scotland},
  pages     = {841--846},
  url       = {http://www.ijcai.org/papers/1401.pdf},
  abstract  = {We study the effects of feature selection and human feedback on features in active learning settings. Our experiments on a variety of text categorization tasks indicate that there is significant potential in improving classifier performance by feature reweighting, beyond that achieved via selective sampling alone (standard active learning) if we have access to an oracle that can point to the important (most predictive) features. Consistent with previous findings, we find that feature selection based on the labeled training set has little effect. But our experiments on human subjects indicate that human feedback on feature relevance can identify a sufficient proportion (65\%) of the most relevant features. Furthermore, these experiments show that feature labeling takes much less (about 1/5th) time than document labeling. We propose an algorithm that interleaves labeling features and documents which significantly accelerates active learning.}
}

@inproceedings{Cohen:2005:FSB,
  author    = {Shay Cohen and Eytan Ruppin and Gideon Dror},
  title     = {Feature Selection Based on the {Shapley} Value},
  booktitle = {Proceedings of the 19th International Joint Conference on Artificial Intelligence},
  year      = {2005},
  month     = {August},
  address   = {Edinburgh, Scotland},
  pages     = {665--670},
  url       = {http://www.ijcai.org/papers/0763.pdf},
  abstract  = {We present and study the Contribution-Selection algorithm (CSA), a novel algorithm for feature selection. The algorithm is based on the Multiperturbation Shapley Analysis, a framework which relies on game theory to estimate usefulness. The algorithm iteratively estimates the usefulness of features and selects them accordingly, using either forward selection or backward elimination. Empirical comparison with several other existing feature selection methods shows that the backward elimination variant of CSA leads to the most accurate classification results on an array of datasets.}
}

@inproceedings{Rousu:2005:LHM,
  author    = {Juho Rousu and Craig Saunders and Sandor Szedmak and John Shawe-Taylor},
  title     = {Learning Hierarchical Multi-Category Text Classification Models},
  booktitle = {Proceedings of the Twenty-Second International Conference on Machine Learning},
  year      = {2005},
  month     = {August},
  address   = {Bonn, Germany},
  url       = {http://www.machinelearning.org/proceedings/icml2005/papers/094_Hierarchical_RousuEtAl.pdf},
  abstract  = {We present a kernel-based algorithm for hierarchical text classification where the documents are allowed to belong to more than one category at a time. The classification model is a variant of the Maximum Margin Markov Network framework, where the classification hierarchy is represented as a Markov tree equipped with an exponential family defined on the edges. We present an efficient optimization algorithm based on incremental conditional gradient ascent in single-example subspaces spanned by the marginal dual variables. Experiments show that the algorithm can feasibly optimize training sets of thousands of examples and classification hierarchies consisting of hundreds of nodes. The algorithm's predictive accuracy is competitive with other recently introduced hierarchical multi-category or multilabel classification learning algorithms.}
}

@inproceedings{Keerthi:2005:GLI,
  author    = {S. Sathiya Keerthi},
  title     = {Generalized {LARS} as an Effective Feature Selection Tool for Text Classification With {SVM}s},
  booktitle = {Proceedings of the Twenty-Second International Conference on Machine Learning},
  year      = {2005},
  month     = {August},
  address   = {Bonn, Germany},
  url       = {http://www.machinelearning.org/proceedings/icml2005/papers/053_GeneralizedLARS_Keerthi.pdf},
  abstract  = {In this paper we generalize the LARS feature selection method to the linear SVM model, derive an efficient algorithm for it, and empirically demonstrate its usefulness as a feature selection tool for text classification.}
}

@inproceedings{Ramakrishnan:2005:MHA,
  author    = {Ganesh Ramakrishnan and Chitrapura, Krishna Prasad and Raghu Krishnapuram and Pushpak Bhattacharyya},
  title     = {A Model for Handling Approximate, Noisy or Incomplete Labeling in Text Classification},
  booktitle = {Proceedings of the Twenty-Second International Conference on Machine Learning},
  year      = {2005},
  month     = {August},
  address   = {Bonn, Germany},
  url       = {http://www.machinelearning.org/proceedings/icml2005/papers/086_HandlingApproximate_RamakrishanEtAl.pdf},
  abstract  = {We introduce a Bayesian model, BayesANIL, that is capable of estimating uncertainties associated with the labeling process. Given a labeled or partially labeled training corpus of text documents, the model estimates the joint distribution of training documents and class labels by using a generalization of the Expectation Maximization algorithm. The estimates can be used in standard classification models to reduce error rates. Since uncertainties in the labeling are taken into account, the model provides an elegant mechanism to deal with noisy labels. We provide an intuitive modification to the EM iterations by re-estimating the empirical distribution in order to reinforce feature values in unlabeled data and to reduce the influence of noisily labeled examples. Considerable improvement in the classification accuracies of two popular classification algorithms on standard labeled data-sets with and without artificially introduced noise, as well as in the presence and absence of unlabeled data, indicates that this may be a promising method to reduce the burden of manual labeling.}
}

@inproceedings{Liu:2004:TCL,
  author    = {Bing Liu and Xiaoli Li and Wee Sun Lee and Yu, Philip S.},
  title     = {Text Classification by Labeling Words},
  booktitle = {Proceedings of the Nineteenth National Conference on Artificial Intelligence},
  year      = {2004},
  month     = {July},
  address   = {San Jose, CA},
  publisher = {AAAI Press},
  abstract  = {Traditionally, text classifiers are built from labeled training examples. Labeling is usually done manually by human experts (or the users), which is a labor intensive and time consuming process. In the past few years, researchers investigated various forms of semi-supervised learning to reduce the burden of manual labeling. In this paper, we propose a different approach. Instead of labeling a set of documents, the proposed method labels a set of representative words for each class. It then uses these words to extract a set of documents for each class from a set of unlabeled documents to form the initial training set. The EM algorithm is then applied to build the classifier. The key issue of the approach is how to obtain a set of representative words for each class. One way is to ask the user to provide them, which is difficult because the user usually can only give a few words (which are insufficient for accurate learning). We propose a method to solve the problem. It combines clustering and feature selection. The technique can effectively rank the words in the unlabeled set according to their importance. The user then selects/labels some words from the ranked list for each class. This process requires less effort than providing words with no help or manual labeling of documents. Our results show that the new method is highly effective and promising.}
}

@article{Bang:2006:HDC,
  author   = {Bang, S. L. and Yang, J. D. and Yang, H. J.},
  title    = {Hierarchical document categorization with {k-NN} and concept-based thesauri},
  journal  = {Information Processing and Management},
  year     = {2006},
  volume   = {42},
  number   = {2},
  pages    = {387--406},
  abstract = {In this paper, we propose a new algorithm, which incorporates the relationships of concept-based thesauri into the document categorization using the k-NN classifier (k-NN). k-NN is one of the most popular document categorization methods because it shows relatively good performance in spite of its simplicity. However, it significantly degrades precision when ambiguity arises, i.e., when there exist more than one candidate category to which a document can be assigned. To remedy the drawback, we employ concept-based thesauri in the categorization. Employing the thesaurus entails structuring categories into hierarchies, since their structure needs to be conformed to that of the thesaurus for capturing relationships between categories. By referencing various relationships in the thesaurus corresponding to the structured categories, k-NN can be prominently improved, removing the ambiguity. In this paper, we first perform the document categorization by using k-NN and then employ the relationships to reduce the ambiguity. Experimental results show that this method improves the precision of k-NN up to 13.86\% without compromising its recall.}
}

@article{Lee:2006:IGD,
  author   = {Lee, Changki and Lee, Gary Geunbae},
  title    = {Information gain and divergence-based feature selection for machine learning-based text categorization},
  journal  = {Information Processing and Management},
  year     = {2006},
  volume   = {42},
  number   = {1},
  pages    = {155--165},
  abstract = {Most previous works of feature selection emphasized only the reduction of high dimensionality of the feature space. But in cases where many features are highly redundant with each other, we must utilize other means, for example, more complex dependence models such as Bayesian network classifiers. In this paper, we introduce a new information gain and divergence-based feature selection method for statistical machine learning-based text categorization without relying on more complex dependence models. Our feature selection method strives to reduce redundancy between features while maintaining information gain in selecting appropriate features for text categorization. Empirical results are given on a number of dataset, showing that our feature selection method is more effective than Koller and Sahami's method [Koller, D., \& Sahami, M. (1996). Toward optimal feature selection. In Proceedings of ICML-96, 13th international conference on machine learning], which is one of greedy feature selection methods, and conventional information gain which is commonly used in feature selection for text categorization. Moreover, our feature selection method sometimes produces more improvements of conventional machine learning algorithms over support vector machines which are known to give the best classification accuracy.}
}

@article{Pant:2005:LCC,
  author   = {Gautam Pant and Padmini Srinivasan},
  title    = {Learning to crawl: Comparing classification schemes},
  journal  = {ACM Transactions on Information Systems},
  year     = {2005},
  volume   = {23},
  number   = {4},
  pages    = {430--462},
  url      = {http://doi.acm.org/10.1145/1095872.1095875},
  abstract = {Topical crawling is a young and creative area of research that holds the promise of benefiting from several sophisticated data mining techniques. The use of classification algorithms to guide topical crawlers has been sporadically suggested in the literature. No systematic study, however, has been done on their relative merits. Using the lessons learned from our previous crawler evaluation studies, we experiment with multiple versions of different classification schemes. The crawling process is modeled as a parallel best-first search over a graph defined by the Web. The classifiers provide heuristics to the crawler thus biasing it towards certain portions of the Web graph. Our results show that Naive Bayes is a weak choice for guiding a topical crawler when compared with Support Vector Machine or Neural Network. Further, the weak performance of Naive Bayes can be partly explained by extreme skewness of posterior probabilities generated by it. We also observe that despite similar performances, different topical crawlers cover subspaces on the Web with low overlap.}
}

@article{Peng:2004:ANB,
  author    = {Fuchun Peng and Dale Schuurmans and Shaojun Wang},
  title     = {Augmenting Naive {Bayes} Classifiers with Statistical Language Models},
  journal   = {Information Retrieval},
  publisher = {Springer Science},
  year      = {2004},
  volume    = {7},
  number    = {3-4},
  pages     = {317--345},
  abstract  = {We augment naive Bayes models with statistical n-gram language models to address short-comings of the standard naive Bayes text classifier. The result is a generalized naive Bayes classifier which allows for a local Markov dependence among observations; a model we refer to as the Chain Augmented Naive Bayes (CAN) Bayes classifier. CAN models have two advantages over standard naive Bayes classifiers. First, they relax some of the independence assumptions of naive Bayes---allowing a local Markov chain dependence in the observed variables---while still permitting efficient inference and learning. Second, they permit straightforward application of sophisticated smoothing techniques from statistical language modeling, which allows one to obtain better parameter estimates than the standard Laplace smoothing used in naive Bayes classification. In this paper, we introduce CAN models and apply them to various text classification problems. To demonstrate the language independent and task independent nature of these classifiers, we present experimental results on several text classification problems---authorship attribution, text genre classification, and topic detection---in several languages---Greek, English, Japanese and Chinese. We then systematically study the key factors in the CAN model that can influence the classification performance, and analyze the strengths and weaknesses of the model.}
}

@article{Makkonen:2004:SST,
  author    = {Juha Makkonen and Helena Ahonen-Myka and Marko Salmenkivi},
  title     = {Simple Semantics in Topic Detection and Tracking},
  journal   = {Information Retrieval},
  publisher = {Springer Science},
  year      = {2004},
  volume    = {7},
  number    = {3-4},
  pages     = {347--368},
  abstract  = {Topic Detection and Tracking (TDT) is a research initiative that aims at techniques to organize news documents in terms of news events. We propose a method that incorporates simple semantics into TDT by splitting the term space into groups of terms that have the meaning of the same type. Such a group can be associated with an external ontology. This ontology is used to determine the similarity of two terms in the given group. We extract proper names, locations, temporal expressions and normal terms into distinct sub-vectors of the document representation. Measuring the similarity of two documents is conducted by comparing a pair of their corresponding sub-vectors at a time. We use a simple perceptron to optimize the relative emphasis of each semantic class in the tracking and detection decisions. The results suggest that the spatial and the temporal similarity measures need to be improved. Especially the vagueness of spatial and temporal terms needs to be addressed.}
}

@article{Kazama:2004:MEM,
  author    = {Junichi Kazama and Junichi Tsujii},
  title     = {Maximum Entropy Models with Inequality Constraints: A Case Study on Text Categorization},
  journal   = {Machine Learning},
  publisher = {Springer Science},
  year      = {2005},
  volume    = {60},
  number    = {1-3},
  pages     = {159--194},
  abstract  = {Data sparseness or overfitting is a serious problem in natural language processing employing machine learning methods. This is still true even for the maximum entropy (ME) method, whose flexible modeling capability has alleviated data sparseness more successfully than the other probabilistic models in many NLP tasks. Although we usually estimate the model so that it completely satisfies the equality constraints on feature expectations with the ME method, complete satisfaction leads to undesirable overfitting, especially for sparse features, since the constraints derived from a limited amount of training data are always uncertain. To control overfitting in ME estimation, we propose the use of box-type inequality constraints, where equality can be violated up to certain predefined levels that reflect this uncertainty. The derived models, inequality ME models, in effect have regularized estimation with L1 norm penalties of bounded parameters. Most importantly, this regularized estimation enables the model parameters to become sparse. This can be thought of as automatic feature selection, which is expected to improve generalization performance further. We evaluate the inequality ME models on text categorization datasets, and demonstrate their advantages over standard ME estimation, similarly motivated Gaussian MAP estimation of ME models, and support vector machines (SVMs), which are one of the state-of-the-art methods for text categorization.}
}

@article{Kim:2005:DRT,
  author   = {Hyunsoo Kim and Peg Howland and Haesun Park},
  title    = {Dimension Reduction in Text Classification with Support Vector Machines},
  journal  = {Journal of Machine Learning Research},
  year     = {2005},
  volume   = {6},
  pages    = {37--53},
  url      = {http://www.jmlr.org/papers/volume6/kim05a/kim05a.pdf},
  abstract = {Support vector machines (SVMs) have been recognized as one of the most successful classification methods for many applications including text classification. Even though the learning ability and computational complexity of training in support vector machines may be independent of the dimension of the feature space, reducing computational complexity is an essential issue to efficiently handle a large number of terms in practical applications of text classification. In this paper, we adopt novel dimension reduction methods to reduce the dimension of the document vectors dramatically. We also introduce decision functions for the centroid-based classification algorithm and support vector classifiers to handle the classification problem where a document may belong to multiple classes. Our substantial experimental results show that with several dimension reduction methods that are designed particularly for clustered data, higher efficiency for both training and testing can be achieved without sacrificing prediction accuracy of text classification even when the dimension of the input space is significantly reduced.}
}

@article{Yang:2005:ACT,
  author   = {Hsin-Chang Yang and Chung-Hong Lee},
  title    = {Automatic Category Theme Identification and Hierarchy Generation for {Chinese} Text Categorization},
  journal  = {Journal of Intelligent Information Systems},
  year     = {2005},
  volume   = {25},
  number   = {1},
  pages    = {47--67},
  abstract = {Recently research on text mining has attracted lots of attention from both industrial and academic fields. Text mining concerns of discovering unknown patterns or knowledge from a large text repository. The problem is not easy to tackle due to the semi-structured or even unstructured nature of those texts under consideration. Many approaches have been devised for mining various kinds of knowledge from texts. One important aspect of text mining is on automatic text categorization, which assigns a text document to some predefined category if the document falls into the theme of the category. Traditionally the categories are arranged in hierarchical manner to achieve effective searching and indexing as well as easy comprehension for human beings. The determination of category themes and their hierarchical structures were most done by human experts. In this work, we developed an approach to automatically generate category themes and reveal the hierarchical structure among them. We also used the generated structure to categorize text documents. The document collection was trained by a self-organizing map to form two feature maps. These maps were then analyzed to obtain the category themes and their structure. Although the test corpus contains documents written in Chinese, the proposed approach can be applied to documents written in any language and such documents can be transformed into a list of separated terms.}
}

@inproceedings{Esuli:2005:DSO,
  author    = {Andrea Esuli and Fabrizio Sebastiani},
  title     = {Determining the semantic orientation of terms through gloss classification},
  booktitle = {Proceedings of the 14th {ACM} international Conference on Information and Knowledge Management},
  year      = {2005},
  month     = {November},
  address   = {Bremen, Germany},
  publisher = {ACM Press},
  pages     = {617--624},
  url       = {http://doi.acm.org/10.1145/1099554.1099713},
  abstract  = {Sentiment classification is a recent subdiscipline of text classification which is concerned not with the topic a document is about, but with the opinion it expresses. It has a rich set of applications, ranging from tracking users' opinions about products or about political candidates as expressed in online forums, to customer relationship management. Functional to the extraction of opinions from text is the determination of the orientation of subjective terms contained in text, i.e. the determination of whether a term that carries opinionated content has a positive or a negative connotation. In this paper we present a new method for determining the orientation of subjective terms. The method is based on the quantitative analysis of the glosses of such terms, i.e. the definitions that these terms are given in on-line dictionaries, and on the use of the resulting term representations for semi-supervised term classification. The method we present outperforms all known methods when tested on the recognized standard benchmarks for this task.}
}

@inproceedings{Ghamrawi:2005:CMC,
  author    = {Nadia Ghamrawi and Andrew McCallum},
  title     = {Collective multi-label classification},
  booktitle = {Proceedings of the 14th {ACM} international Conference on Information and Knowledge Management},
  year      = {2005},
  month     = {November},
  address   = {Bremen, Germany},
  publisher = {ACM Press},
  url       = {http://doi.acm.org/10.1145/1099554.1099591},
  abstract  = {Common approaches to multi-label classification learn independent classifiers for each category, and employ ranking or thresholding schemes for classification. Because they do not exploit dependencies between labels, such techniques are only well-suited to problems in which categories are independent. However, in many domains labels are highly interdependent. This paper explores multi-label conditional random field (CRF) classification models that directly parameterize label co-occurrences in multi-label classification. Experiments show that the models outperform their single-label counterparts on standard text corpora. Even when multi-labels are sparse, the models improve subset classification error by as much as 40\%.}
}

@inproceedings{Tan:2005:NRA,
  author    = {Songbo Tan and Xueqi Cheng and Moustafa M. Ghanem and Bin Wang and Hongbo Xu},
  title     = {A novel refinement approach for text categorization},
  booktitle = {Proceedings of the 14th {ACM} international Conference on Information and Knowledge Management},
  year      = {2005},
  month     = {November},
  address   = {Bremen, Germany},
  publisher = {ACM Press},
  url       = {http://doi.acm.org/10.1145/1099554.1099687},
  abstract  = {In this paper we present a novel strategy, DragPushing, for improving the performance of text classifiers. The strategy is generic and takes advantage of training errors to successively refine the classification model of a base classifier. We describe how it is applied to generate two new classification algorithms; a Refined Centroid Classifier and a Refined Na{\"\i}ve Bayes Classifier. We present an extensive experimental evaluation of both algorithms on three English collections and one Chinese corpus. The results indicate that in each case, the refined classifiers achieve significant performance improvement over the base classifiers used. Furthermore, the performance of the Refined Centroid Classifier implemented is comparable, if not better, to that of state-of-the-art support vector machine (SVM)-based classifier, but offers a much lower computational cost.}
}

@inproceedings{Zhang:2005:IGF,
  author    = {Baoping Zhang and Yuxin Chen and Weiguo Fan and Edward A. Fox and Marcos Goncalves and Marco Cristo and Pavel Calado},
  title     = {Intelligent {GP} fusion from multiple sources for text classification},
  booktitle = {Proceedings of the 14th {ACM} international Conference on Information and Knowledge Management},
  year      = {2005},
  month     = {November},
  address   = {Bremen, Germany},
  publisher = {ACM Press},
  url       = {http://doi.acm.org/10.1145/1099554.1099688},
  abstract  = {This paper shows how citation-based information and structural content (e.g., title, abstract) can be combined to improve classification of text documents into predefined categories. We evaluate different measures of similarity -- five derived from the citation information of the collection, and three derived from the structural content -- and determine how they can be fused to improve classification effectiveness. To discover the best fusion framework, we apply Genetic Programming (GP) techniques. Our experiments with the ACM Computing Classification Scheme, using documents from the ACM Digital Library, indicate that GP can discover similarity functions superior to those based solely on a single type of evidence. Effectiveness of the similarity functions discovered through simple majority voting is better than that of content-based as well as combination-based Support Vector Machine classifiers. Experiments also were conducted to compare the performance between GP techniques and other fusion techniques such as Genetic Algorithms (GA) and linear fusion. Empirical results show that GP was able to discover better similarity functions than GA or other fusion techniques.}
}

@inproceedings{Li:2005:NDB,
  author    = {Xiaoyan Li and Croft, W. Bruce},
  title     = {Novelty detection based on sentence level patterns},
  booktitle = {Proceedings of the 14th {ACM} international Conference on Information and Knowledge Management},
  year      = {2005},
  month     = {November},
  address   = {Bremen, Germany},
  publisher = {ACM Press},
  pages     = {744--751},
  url       = {http://doi.acm.org/10.1145/1099554.1099734},
  abstract  = {The detection of new information in a document stream is an important component of many potential applications. In this paper, a new novelty detection approach based on the identification of sentence level patterns is proposed. Given a user's information need, some patterns in sentences such as combinations of query words, named entities and phrases, may contain more important and relevant information than single words. Therefore, the proposed novelty detection approach focuses on the identification of previously unseen query-related patterns in sentences. Specifically, a query is preprocessed and represented with patterns that include both query words and required answer types. These patterns are used to retrieve sentences, which are then determined to be novel if it is likely that a new answer is present. An analysis of patterns in sentences was performed with data from the TREC 2002 novelty track and experiments on novelty detection were carried out on data from the TREC 2003 and 2004 novelty tracks. The experimental results show that the proposed pattern-based approach significantly outperforms all three baselines in terms of precision at top ranks.}
}

@inproceedings{Whitelaw:2005:UAG,
  author    = {Casey Whitelaw and Navendu Garg and Shlomo Argamon},
  title     = {Using appraisal groups for sentiment analysis},
  booktitle = {Proceedings of the 14th {ACM} international Conference on Information and Knowledge Management},
  year      = {2005},
  month     = {November},
  address   = {Bremen, Germany},
  publisher = {ACM Press},
  pages     = {625--631},
  url       = {http://doi.acm.org/10.1145/1099554.1099714},
  abstract  = {Little work to date in sentiment analysis (classifying texts by `positive' or `negative' orientation) has attempted to use fine-grained semantic distinctions in features used for classification. We present a new method for sentiment classification based on extracting and analyzing appraisal groups such as ``very good'' or ``not terribly funny''. An appraisal group is represented as a set of attribute values in several task-independent semantic taxonomies, based on Appraisal Theory. Semi-automated methods were used to build a lexicon of appraising adjectives and their modifiers. We classify movie reviews using features based upon these taxonomies combined with standard ``bag-of-words'' features, and report state-of-the-art accuracy of 90.2\%. In addition, we find that some types of appraisal appear to be more significant for sentiment classification than others.}
}

@inproceedings{Boese:2005:EWD,
  author    = {Boese, Elizabeth Sugar and Howe, Adele E.},
  title     = {Effects of web document evolution on genre classification},
  booktitle = {Proceedings of the 14th {ACM} international Conference on Information and Knowledge Management},
  year      = {2005},
  month     = {November},
  address   = {Bremen, Germany},
  publisher = {ACM Press},
  pages     = {632--639},
  url       = {http://doi.acm.org/10.1145/1099554.1099715},
  abstract  = {The World Wide Web is a massive corpus that constantly evolves. Classification experiments usually grab a snapshot (temporally and spatially) of the Web for a corpus. In this paper, we examine the effects of page evolution on genre classification of Web pages. Web genre refers to the type of the page characterized by features such as style, form or presentation layout, and meta-content; Web genre can be used to tune spider crawling re-visits and inform relevance judgments for search engines. We found that pages in some genres change rarely if at all and can be used in present-day research experiments without requiring an updated version. We show that an old corpus can be used for training when testing on new Web pages, with only a marginal drop in accuracy rates on genre classification. We also show that features found to be useful in one corpus do not transfer well to other corpora with different genres.}
}

@article{Vogel:2005:KDDCUP2005,
  author   = {David Vogel and Steffen Bickel and Peter Haider and Rolf Schimpfky and Peter Siemen and Steve Bridges and Tobias Scheffer},
  title    = {Classifying Search Engine Queries Using the Web as Background Knowledge},
  journal  = {{SIGKDD} Explorations},
  year     = {2005},
  volume   = {7},
  number   = {2},
  pages    = {117--122},
  url      = {http://www.acm.org/sigs/sigkdd/explorations/issues/7-2-2005-12/KDDCUP2005Report_Scheffer.pdf},
  abstract = {The performance of search engines crucially depends on their ability to capture the meaning of a query most likely intended by the user. We study the problem of mapping a search engine query to those nodes of a given subject taxonomy that characterize its most likely meanings. We describe the architecture of a classification system that uses a web directory to identify the subject context that the query terms are frequently used in. Based on its performance on the classification of 800,000 example queries recorded from MSN search, the system received the Runner-Up Award for Query Categorization Performance of the KDD Cup 2005.}
}

@Article{Li:2005:KDDCUP2005Report,
  author   = "Li, Ying and Zheng, Zijian and Dai, Honghua (Kathy)",
  title    = "{KDD CUP-2005} Report: Facing a Great Challenge",
  journal  = "{SIGKDD} Explorations",
  volume   = 7,
  number   = 2,
  pages    = "91--99",
  year     = 2005,
  url      = "http://www.acm.org/sigs/sigkdd/explorations/issues/7-2-2005-12/KDDCUP2005Report_organizers.pdf",
  abstract = "The KDD-Cup 2005 Competition was held in conjunction with the Eleventh ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. The task of the KDDCup 2005 competition was to classify 800,000 internet user search queries into 67 predefined categories. This task is easy to understand, but the lack of straightforward training set, subjective user intents of queries, poor information in short queries, and high noise level make the task very challenge. In this paper, we summarize the competition task, the evaluation method, and the results of the competition. Here we only highlight some key techniques used in submitted solutions. The technical details of the solutions from the three award winning teams are available in their papers separately in this issue of SIGKDD Explorations. At the end, we also share the results of a survey conducted with this year's Cup participants. To facilitate research in this area, the task description, data, answer set, and related information of this KDD-Cup are published at the KDDCup 2005 web site: http://www.acm.org/sigs/sigkdd/kdd2005/kddcup.html."
}

@Article{Shen:2005:Q2C,
  author   = "Shen, D. and Pan, R. and Sun, J.-T. and Pan, J. J. and Wu, K. and Yin, J. and Yang, Q.",
  title    = "{Q2C@UST}: Our Winning Solution to Query Classification in {KDDCUP} 2005",
  journal  = "{SIGKDD} Explorations",
  volume   = 7,
  number   = 2,
  pages    = "100--110",
  year     = 2005,
  url      = "http://www.acm.org/sigs/sigkdd/explorations/issues/7-2-2005-12/KDDCUP2005Report_Shen.pdf",
  abstract = "In this paper, we describe our ensemble-search based approach, Q2C@UST (http://webproject1.cs.ust.hk/q2c/), for the query classification task for the KDDCUP 2005. There are two aspects to the key difficulties of this problem: one is that the meaning of the queries and the semantics of the predefined categories are hard to determine. The other is that there are no training data for this classification problem. We apply a two-phase framework to tackle the above difficulties. Phase I corresponds to the training phase of machine learning research and phase II corresponds to testing phase. In phase I, two kinds of classifiers are developed as the base classifiers. One is synonym-based and the other is statistics based. Phase II consists of two stages. In the first stage, the queries are enriched such that for each query, its related Web pages together with their category information are collected through the use of search engines. In the second stage, the enriched queries are classified through the base classifiers trained in phase I. Based on the classification results obtained by the base classifiers, two ensemble classifiers based on two different strategies are proposed. The experimental results on the validation dataset help confirm our conjectures on the performance of the Q2C@UST system. In addition, the evaluation results given by the KDDCUP 2005 organizer confirm the effectiveness of our proposed approaches. The best F1 value of our two solutions is 9.6\% higher than the best of all other participants' solutions. The average F1 value of our two submitted solutions is 94.4\% higher than the average F1 value from all other submitted solutions."
}

@Article{Kardcovacs:2005:Ferrety,
  author   = "Kardkovacs, Z. T. and Tikk, D. and Bansaghi, Z.",
  title    = "The {Ferrety} Algorithm for the {KDD} Cup 2005 Problem",
  journal  = "{SIGKDD} Explorations",
  volume   = 7,
  number   = 2,
  pages    = "111--116",
  year     = 2005,
  url      = "http://www.acm.org/sigs/sigkdd/explorations/issues/7-2-2005-12/KDDCUP2005Report_Ferrety.pdf",
  abstract = "In this paper, we present a general solution for the KDD Cup 2005 problem. It uses the Internet as source of knowledge and extends it to categorize very short (less than 5 words) documents with reasonable accuracy. Our approach consists of three main parts: i.) a central knowledge filter ii.) an on-demand web crawler and iii.) a very efficient categorizer system. Our solution obtained Creativity and Precision Runner-up Awards at the competition. The main idea of Ferrety Algorithm can be generalized for mapping one taxonomy to another if training documents are available."
}

@InProceedings{Montejo-Raez:2005:TCU,
  author    = "Montejo-Raez, Arturo and Urena-Lopez, L. Alfonso and Steinberger, Ralf",
  title     = "Text categorization using bibliographic records: beyond document content",
  booktitle = "Proceedings of the 21st Conference of the Spanish Society for Natural Language Processing (SEPLN'2005)",
  year      = "2005",
  month     = sep,
  address   = "Granada, Spain",
  url       = "http://www.jrc.cec.eu.int/langtech/Documents/0509_SEPLN-05_Montejo-et-al.pdf",
  abstract  = "This paper studies the use of different sources of information for performing a text classification task. The growing number of digital libraries imposes a review of the available data from those databases. Some experiments applying different base classifiers for a multi-label classifier in the domain of High Energy Physics on several of these possible sources have been carried out. Results show that the use of metadata is almost as good as the full-text version of papers. Keywords: text categorization, machine learning, digital libraries."
}

@InProceedings{Tailby:2006:ECA,
  author    = "Tailby, Ross and Dean, Richard and Milner, Ben and Smith, Dan",
  title     = "Email classification for automated service handling",
  booktitle = "SAC'06",
  year      = "2006",
  month     = apr,
  address   = "Dijon, France",
  abstract  = "We describe the experience and lessons learned from developing a range of electronic services for a specialist engineering company. We are using a custom workflow management system as the base for a range of services which are offered via a multimodal portal, using a language-based approach to extracting information from HTML forms, email, and SMS. We describe the email classification experiments we have carried out and discuss the development of customer services based on automatic email classification."
}

@InProceedings{Wu:2004:IPK,
  author    = "Wu, Xiaoyun and Srihari, Rohini",
  title     = "Incorporating Prior Knowledge with Weighted Margin Support Vector Machines",
  booktitle = "KDD'04",
  pages     = "326--333",
  year      = "2004",
  month     = aug,
  address   = "Seattle, Washington",
  abstract  = "Like many purely data-driven machine learning methods, Support Vector Machine (SVM) classifiers are learned exclusively from the evidence presented in the training dataset; thus a larger training dataset is required for better performance. In some applications, there might be human knowledge available that, in principle, could compensate for the lack of data. In this paper, we propose a simple generalization of SVM: Weighted Margin SVM (WMSVMs) that permits the incorporation of prior knowledge. We show that Sequential Minimal Optimization can be used in training WMSVM. We discuss the issues of incorporating prior knowledge using this rather general formulation. The experimental results show that the proposed methods of incorporating prior knowledge is effective."
}

@InProceedings{Pekar:2004:CWP,
  author    = "Pekar, Viktor and Evans, Richard and Mitkov, Ruslan",
  title     = "Categorizing Web Pages as a Preprocessing Step for Information Extraction",
  booktitle = "LREC'04",
  year      = "2004"
}

@InProceedings{Hahn:2004:PDT,
  author    = "Hahn, Udo and Wermter, Joachim",
  title     = "Pumping Documents Through a Domain and Genre Classification Pipeline",
  booktitle = "LREC'04",
  year      = "2004"
}

@InProceedings{Cohen:2004:LCE,
  author    = "Cohen, William W. and Carvalho, Vitor R. and Mitchell, Tom M.",
  title     = "Learning to Classify Email into ``Speech Acts''",
  booktitle = "EMNLP'04",
  pages     = "309--316",
  year      = "2004",
  month     = jul,
  address   = "Barcelona, Spain",
  publisher = "Association for Computational Linguistics",
  abstract  = "It is often useful to classify email according to the intent of the sender (e.g., 'propose a meeting', 'deliver information'). We present experimental results in learning to classify email in this fashion, where each class corresponds to a verb-noun pair taken from a predefined ontology describing typical 'email speech acts'. We demonstrate that, although this categorization problem is quite different from 'topical' text classification, certain categories of messages can nonetheless be detected with high precision (above 80\%) and reasonable recall (above 50\%) using existing text-classification learning methods. This result suggests that useful task-tracking tools could be constructed based on automatic classification into this taxonomy."
}

@InProceedings{Alm:2005:EFT,
  author    = "Alm, Cecilia Ovesdotter and Roth, Dan and Sproat, Richard",
  title     = "Emotions from text: machine learning for text-based emotion prediction",
  booktitle = "EMNLP'05",
  year      = "2005"
}

@InProceedings{Schiffman:2005:CLN,
  author    = "Schiffman, Barry and McKeown, Kathleen R.",
  title     = "Context and Learning in Novelty Detection",
  booktitle = "EMNLP'05",
  year      = "2005",
  abstract  = "We demonstrate the value of using context in a new-information detection system that achieved the highest precision scores at the Text Retrieval Conference's Novelty Track in 2004. In order to determine whether information within a sentence has been seen in material read previously, our system integrates information about the context of the sentence with novel words and named entities within the sentence, and uses a specialized learning algorithm to tune the system parameters."
}

% NOTE(review): the author, title and abstract of the next entry used to be a
% verbatim copy of Schiffman:2005:CLN. Author and title now match the citation
% key and the COLING'04 page range; the mismatched abstract was removed and
% should be re-added from the paper itself after verification.
@InProceedings{Sahlgren:2004:UBC,
  author    = "Sahlgren, Magnus and C{\"o}ster, Rickard",
  title     = "Using Bag-of-Concepts to Improve the Performance of Support Vector Machines in Text Categorization",
  booktitle = "COLING'04",
  pages     = "487--493",
  year      = "2004"
}

@InProceedings{Gamon:2004:SCC,
  author    = "Gamon, Michael",
  title     = "Sentiment classification on customer feedback data: noisy data, large feature vectors, and the role of linguistic analysis",
  booktitle = "Proceedings of Coling 2004",
  pages     = "841--847",
  year      = 2004,
  month     = aug,
  address   = "Geneva, Switzerland",
  publisher = "COLING",
  url       = "http://acl.ldc.upenn.edu/coling2004/MAIN/pdf/121-637.pdf",
  abstract  = "We demonstrate that it is possible to perform automatic sentiment classification in the very noisy domain of customer feedback data. We show that by using large feature vectors in combination with feature reduction, we can train linear support vector machines that achieve high classification accuracy on data that present classification challenges even for a human annotator. We also show that, surprisingly, the addition of deep linguistic analysis features to a set of surface level word n-gram features contributes consistently to classification accuracy in this domain."
}

@article{Bianchi:2006:IAH,
  author   = {Cesa-Bianchi, Nicol{\`o} and Gentile, Claudio and Zaniboni, Luca},
  title    = {Incremental Algorithms for Hierarchical Classification},
  journal  = {Journal of Machine Learning Research},
  volume   = {7},
  pages    = {31--54},
  year     = {2006},
  url      = {http://jmlr.csail.mit.edu/papers/volume7/cesa-bianchi06a/cesa-bianchi06a.pdf},
  abstract = {We study the problem of classifying data in a given taxonomy when classifications associated with multiple and/or partial paths are allowed. We introduce a new algorithm that incrementally learns a linear-threshold classifier for each node of the taxonomy. A hierarchical classification is obtained by evaluating the trained node classifiers in a top-down fashion. To evaluate classifiers in our multipath framework, we define a new hierarchical loss function, the H-loss, capturing the intuition that whenever a classification mistake is made on a node of the taxonomy, then no loss should be charged for any additional mistake occurring in the subtree of that node. Making no assumptions on the mechanism generating the data instances, and assuming a linear noise model for the labels, we bound the H-loss of our on-line algorithm in terms of the H-loss of a reference classifier knowing the true parameters of the label-generating process. We show that, in expectation, the excess cumulative H-loss grows at most logarithmically in the length of the data sequence. Furthermore, our analysis reveals the precise dependence of the rate of convergence on the eigenstructure of the data each node observes. Our theoretical results are complemented by a number of experiments on textual corpora. In these experiments we show that, after only one epoch of training, our algorithm performs much better than Perceptron-based hierarchical classifiers, and reasonably close to a hierarchical support vector machine.}
}

@article{Diaz04,
  author   = {Irene D{\'{\i}}az and Jos{\'{e}} Ranilla and Elena Monta{\~{n}}es and Javier Fern{\'{a}}ndez and El{\'{\i}}as F. Combarro},
  title    = {Improving Performance of Text Categorization by Combining Filtering and Support Vector Machines},
  journal  = {Journal of the American Society for Information Science and Technology},
  volume   = {55},
  number   = {7},
  pages    = {578--592},
  year     = {2004},
  url      = {http://dx.doi.org/10.1002/asi.10409},
  doi      = {10.1002/asi.10409},
  abstract = {Text Categorization is the process of assigning documents to a set of previously fixed categories. A lot of research is going on with the goal of automating this time-consuming task. Several different algorithms have been applied, and Support Vector Machines (SVM) have shown very good results. In this report, we try to prove that a previous filtering of the words used by SVM in the classification can improve the overall performance. This hypothesis is systematically tested with three different measures of word relevance, on two different corpus (one of them considered in three different splits), and with both local and global vocabularies. The results show that filtering significantly improves the recall of the method, and that also has the effect of significantly improving the overall performance.}
}

@Article{CombarroMDRM05,
  author   = "El{\'i}as F. Combarro and Elena Monta{\~n}{\'e}s and Irene D{\'i}az and Jos{\'e} Ranilla and Ricardo Mones",
  title    = "Introducing a Family of Linear Measures for Feature Selection in Text Categorization",
  journal  = "{IEEE} Transactions on Knowledge and Data Engineering",
  volume   = "17",
  number   = "9",
  pages    = "1223--1232",
  year     = "2005",
  url      = "http://doi.ieeecomputersociety.org/10.1109/TKDE.2005.149",
  doi      = "10.1109/TKDE.2005.149",
  abstract = "Text Categorization, which consists of automatically assigning documents to a set of categories, usually involves the management of a huge number of features. Most of them are irrelevant and others introduce noise which could mislead the classifiers. Thus, feature reduction is often performed in order to increase the efficiency and effectiveness of the classification. In this paper, we propose to select relevant features by means of a family of linear filtering measures which are simpler than the usual measures applied for this purpose. We carry out experiments over two different corpora and find that the proposed measures perform better than the existing ones."
}

@article{MontanesDRCF05,
  author   = "Elena Monta{\~n}{\'e}s and Irene D{\'i}az and Jos{\'e} Ranilla and El{\'i}as F. Combarro and Javier Fern{\'a}ndez",
  title    = "Scoring and Selecting Terms for Text Categorization",
  journal  = "IEEE Intelligent Systems",
  volume   = "20",
  number   = "3",
  pages    = "40--47",
  year     = "2005",
  url      = "http://doi.ieeecomputersociety.org/10.1109/MIS.2005.49",
  doi      = "10.1109/MIS.2005.49",
  abstract = "Machine learning has become one of the main approaches to tackling text categorization. Because text domains present much irrelevant information, effective feature reduction is essential to improve classifiers' effectiveness and efficiency. A set of new scoring measures for feature selection taken from the machine learning domain were evaluated over two well-known collections of documents. Some of these measures outperformed traditional measures from information retrieval and information theory in certain situations."
}

@ARTICLE{kyber03,
  author  = "D. Tikk and J. D. Yang and S. L. Bang",
  title   = "Hierarchical text categorization using fuzzy relational thesaurus",
  journal = "Kybernetika",
  volume  = 39,
  number  = 5,
  pages   = "583--600",
  year    = 2003
}

@ARTICLE{ajiips04a,
  author  = "D. Tikk and {Gy.} Biro and J. D. Yang",
  title   = "A hierarchical text categorization approach and its application to {FRT} expansion",
  journal = "Australian Journal of Intelligent Information Processing Systems",
  volume  = 8,
  number  = 3,
  pages   = "123--131",
  year    = 2004,
  issn    = "1321-2133"
}

@INCOLLECTION{ISUMAbook05,
  author    = "D. Tikk and {Gy.} Biro and J. D. Yang",
  title     = "Experiments with a Hierarchical Text Categorization Method on {WIPO} Patent Collections",
  booktitle = "Applied Research in Uncertainty Modelling and Analysis",
  editor    = "N. O. Attoh-Okine and B. M. Ayyub",
  series    = "International Series in Intelligent Technologies",
  number    = 20,
  publisher = "Springer",
  pages     = "283--302",
  year      = 2005,
  isbn      = "0-387-23535-3"
}

@INPROCEEDINGS{ISUMA03,
  author    = "D. Tikk and {Gy.} Biro",
  title     = "Experiment with a hierarchical text categorization method on the {WIPO} patent collection",
  booktitle = "Proc. of the 4th Int. Symp. on Uncertainty Modeling and Analysis (ISUMA'03)",
  pages     = "104--109",
  address   = "University of Maryland, USA",
  year      = 2003,
  month     = sep # "~21--24"
}

@INPROCEEDINGS{ICCC03,
  author    = "D. Tikk and {Gy.} Biro",
  title     = "Experiments with multilabel text classifier on the {R}euters collection",
  booktitle = "Proc. of the IEEE Int. Conf. on Computational Cybernetics (ICCC03)",
  pages     = "33--38",
  address   = "Siofok, Hungary",
  year      = 2003,
  month     = aug # "~29--31",
  isbn      = "963-7154-17-5",
}

@InProceedings{Gabrilovich:2006:Wikipedia,
  author     = "Gabrilovich, Evgeniy and Markovitch, Shaul",
  title      = "Overcoming the Brittleness Bottleneck using {W}ikipedia: Enhancing Text Categorization with Encyclopedic Knowledge",
  booktitle1 = "AAAI'06",
  booktitle  = "Proceedings of the 21st National Conference on Artificial Intelligence",
  pages      = "1301--1306",
  year       = "2006",
  month      = jul,
  address1   = "Boston, MA"
}

@InProceedings{Lan:2006:PNT,
  author     = "Lan, Man and Tan, Chew-Lim and Low, Hwee-Boon",
  title      = "Proposing a New Term Weighting Scheme for Text Categorization",
  booktitle1 = "AAAI'06",
  booktitle  = "Proceedings of the 21st National Conference on Artificial Intelligence",
  pages      = "763--768",
  year       = "2006",
  month      = jul,
  address1   = "Boston, MA"
}

@article{Raghavan:2006:ALF,
  author   = {Raghavan, Hema and Madani, Omid and Jones, Rosie},
  title    = {Active Learning with Feedback on Features and Instances},
  journal  = {Journal of Machine Learning Research},
  volume   = {7},
  pages    = {1655--1686},
  year     = {2006},
  abstract = {We extend the traditional active learning framework to include feedback on features in addition to labeling instances, and we execute a careful study of the effects of feature selection and human feedback on features in the setting of text categorization. Our experiments on a variety of categorization tasks indicate that there is significant potential in improving classifier performance by feature re-weighting, beyond that achieved via membership queries alone (traditional active learning) if we have access to an oracle that can point to the important (most predictive) features. Our experiments on human subjects indicate that human feedback on feature relevance can identify a sufficient proportion of the most relevant features (over 50\% in our experiments). We find that on average, labeling a feature takes much less time than labeling a document. We devise an algorithm that interleaves labeling features and documents which significantly accelerates standard active learning in our simulation experiments. Feature feedback can complement traditional active learning in applications such as news filtering, e-mail classification, and personalization, where the human teacher can have significant knowledge on the relevance of features.},
}

@InProceedings{Forman:2006:TCD,
  author    = "Forman, George",
  title     = "Tackling concept drift by temporal inductive transfer",
  booktitle = "Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval",
  pages     = "252--259",
  year      = "2006",
  abstract  = "Machine learning is the mainstay for text classification. However, even the most successful techniques are defeated by many real-world applications that have a strong time-varying component. To advance research on this challenging but important problem, we promote a natural, experimental framework-the Daily Classification Task-which can be applied to large time-based datasets, such as Reuters RCV1. In this paper we dissect concept drift into three main subtypes. We demonstrate via a novel visualization that the recurrent themes subtype is present in RCV1. This understanding led us to develop a new learning model that transfers induced knowledge through time to benefit future classifier learning tasks. The method avoids two main problems with existing work in inductive transfer: scalability and the risk of negative transfer. In empirical tests, it consistently showed more than 10 points F-measure improvement for each of four Reuters categories tested."
}

@InProceedings{Sindhwani:2006:LSS,
  author    = "Sindhwani, Vikas and Keerthi, S. Sathiya",
  title     = "Large scale semi-supervised linear {SVMs}",
  booktitle = "Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval",
  pages     = "477--484",
  year      = "2006",
  abstract  = "Large scale learning is often realistic only in a semi-supervised setting where a small set of labeled examples is available together with a large collection of unlabeled data. In many information retrieval and data mining applications, linear classifiers are strongly preferred because of their ease of implementation, interpretability and empirical performance. In this work, we present a family of semi-supervised linear support vector classifiers that are designed to handle partially-labeled sparse datasets with possibly very large number of examples and features. At their core, our algorithms employ recently developed modified finite Newton techniques. Our contributions in this paper are as follows: (a) We provide an implementation of Transductive SVM (TSVM) that is significantly more efficient and scalable than currently used dual techniques, for linear classification problems involving large, sparse datasets. (b) We propose a variant of TSVM that involves multiple switching of labels. Experimental results show that this variant provides an order of magnitude further improvement in training efficiency. (c) We present a new algorithm for semi-supervised learning based on a Deterministic Annealing (DA) approach. This algorithm alleviates the problem of local minimum in the TSVM optimization procedure while also being computationally attractive. We conduct an empirical study on several document classification tasks which confirms the value of our methods in large scale semi-supervised settings."
}

@InProceedings{Angelova:2006:GBT,
  author    = "Angelova, Ralitsa and Weikum, Gerhard",
  title     = "Graph-based text classification: learn from your neighbors",
  booktitle = "Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval",
  pages     = "485--492",
  year      = "2006",
  abstract  = "Automatic classification of data items, based on training samples, can be boosted by considering the neighborhood of data items in a graph structure (e.g., neighboring documents in a hyperlink environment or co-authors and their publications for bibliographic data entries). This paper presents a new method for graph-based classification, with particular emphasis on hyperlinked text documents but broader applicability. Our approach is based on iterative relaxation labeling and can be combined with either Bayesian or SVM classifiers on the feature spaces of the given data items. The graph neighborhood is taken into consideration to exploit locality patterns while at the same time avoiding overfitting. In contrast to prior work along these lines, our approach employs a number of novel techniques: dynamically inferring the link/class pattern in the graph in the run of the iterative relaxation labeling, judicious pruning of edges from the neighborhood graph based on node dissimilarities and node degrees, weighting the influence of edges based on a distance metric between the classification labels of interest and weighting edges by content similarity measures. Our techniques considerably improve the robustness and accuracy of the classification outcome, as shown in systematic experimental comparisons with previously published methods on three different real-world datasets."
}

@InProceedings{Stein:2006:EOE,
  author    = "Stein, Sterling Stuart and Argamon, Shlomo and Frieder, Opher",
  title     = "The effect of {OCR} errors on stylistic text classification",
  booktitle = "Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval",
  pages     = "701--702",
  year      = "2006",
  abstract  = "Recently, interest is growing in non-topical text classification tasks such as genre classification, sentiment analysis, and authorship profiling. We study to what extent OCR errors affect stylistic text classification from scanned documents. We find that even a relatively high level of errors in the OCRed documents does not substantially affect stylistic classification accuracy."
}

@InProceedings{Olsson:2006:ACT,
  author    = "Olsson, J. Scott",
  title     = "An analysis of the coupling between training set and neighborhood sizes for the {kNN} classifier",
  booktitle = "Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval",
  pages     = "685--686",
  year      = "2006",
  abstract  = "We consider the relationship between training set size and the parameter k for the k-Nearest Neighbors (kNN) classifier. When few examples are available, we observe that accuracy is sensitive to k and that best k tends to increase with training size. We explore the subsequent risk that k tuned on partitions will be suboptimal after aggregation and re-training. This risk is found to be most severe when little data is available. For larger training sizes, accuracy becomes increasingly stable with respect to k and the risk decreases."
}

@InProceedings{Koppel:2006:AAT,
  author    = "Koppel, Moshe and Schler, Jonathan and Argamon, Shlomo and Messeri, Eran",
  title     = "Authorship attribution with thousands of candidate authors",
  booktitle = "Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval",
  pages     = "659--660",
  year      = "2006",
  abstract  = "In this paper, we use a blog corpus to demonstrate that we can often identify the author of an anonymous text even where there are many thousands of candidate authors. Our approach combines standard information retrieval methods with a text categorization meta-learning scheme that determines when to even venture a guess."
}

@InProceedings{Dayanik:2006:CIP,
  author    = "Dayanik, Aynur and Lewis, David D. and Madigan, David and Menkov, Vladimir and Genkin, Alexander",
  title     = "Constructing informative prior distributions from domain knowledge in text classification",
  booktitle = "Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval",
  pages     = "493--500",
  year      = "2006",
  abstract  = "Supervised learning approaches to text classification are in practice often required to work with small and unsystematically collected training sets. The alternative to supervised learning is usually viewed to be building classifiers by hand, using a domain expert's understanding of which features of the text are related to the class of interest. This is expensive, requires a degree of sophistication about linguistics and classification, and makes it difficult to use combinations of weak predictors. We propose instead combining domain knowledge with training examples in a Bayesian framework. Domain knowledge is used to specify a prior distribution for the parameters of a logistic regression model, and labeled training data is used to produce a posterior distribution, whose mode we take as the final classifier. We show on three text categorization data sets that this approach can rescue what would otherwise be disastrously bad training situations, producing much more effective classifiers."
}

@InProceedings{Shen:2006:CIE,
  author    = "Shen, Dou and Sun, Jian-Tao and Yang, Qiang and Chen, Zheng",
  title     = "A Comparison of Implicit and Explicit Links for Web Page Classification",
  booktitle = "Proceedings of the 15th International Conference on World Wide Web",
  pages     = "643--650",
  year      = "2006",
  abstract  = "It is well known that Web-page classification can be enhanced by using hyperlinks that provide linkages between Web pages. However, in the Web space, hyperlinks are usually sparse, noisy and thus in many situations can only provide limited help in classification. In this paper, we extend the concept of linkages from explicit hyperlinks to implicit links built between Web pages. By observing that people who search the Web with the same queries often click on different, but related documents together, we draw implicit links between Web pages that are clicked after the same queries. Those pages are implicitly linked. We provide an approach for automatically building the implicit links between Web pages using Web query logs, together with a thorough comparison between the uses of implicit and explicit links in Web page classification. Our experimental results on a large dataset confirm that the use of the implicit links is better than using explicit links in classification performance, with an increase of more than 10.5\% in terms of the Macro-F1 measurement."
} @InProceedings{Cui:2006:CES, author = "Hang Cui and Vibhu Mittal and Mayur Datar", title = "Comparative Experiments on Sentiment Classification for Online Product Reviews", booktitle = "Proceedings of the 21st National Conference on Artificial Intelligence", year = "2006", month = "July", pages = "1265--1270", abstract = "Evaluating text fragments for positive and negative subjective expressions and their strength can be important in applications such as single- or multi- document summarization, document ranking, data mining, etc. This paper looks at a simplified version of the problem: classifying online product reviews into positive and negative classes. We discuss a series of experiments with different machine learning algorithms in order to experimentally evaluate various trade-offs, using approximately 100K product reviews from the web." } @InProceedings{Yin:2006:ANB, author = "Ling Yin and Richard Power", title = "Adapting the Naive Bayes Classifier to Rank Procedural Texts", booktitle = "Proceedings of the 28th European Conference on IR Research (ECIR)", year = "2006", pages = "179--190" } @InProceedings{Freschi:2006:FOE, author = "Valerio Freschi and Andrea Seraghiti and Alessandro Bogliolo", title = "Filtering Obfuscated Email Spam by means of Phonetic String Matching", booktitle = "Proceedings of the 28th European Conference on IR Research (ECIR)", year = "2006", pages = "505--509" } @InProceedings{Bouma:2006:SHT, author = "Bouma, Lucas and de Rijke, Maarten", title = "Specificity Helps Text Classification", booktitle = "Proceedings of the 28th European Conference on IR Research (ECIR)", year = "2006", pages = "539--542" } @InProceedings{Fu:2005:ATC, author = "Fu, Yueyu and Ke, Weimao and Mostafa, Javed", title = "Automated text classification using a multi-agent framework", booktitle = "Proceedings of the 5th ACM/IEEE-CS joint conference on Digital libraries", year = "2005", pages = "157--158", abstract = "Automatic text classification is an important 
operational problem in digital library practice. Most text classification efforts so far concentrated on developing centralized solutions. However, centralized classification approaches often are limited due to constraints on knowledge and computing resources. In addition, centralized approaches are more vulnerable to attacks or system failures and less robust in dealing with them. We present a de-centralized approach and system implementation (named MACCI) for text classification using a multi-agent framework. Experiments are conducted to compare our multi-agent approach with a centralized approach. The results show multi-agent classification can achieve promising classification results while maintaining its other advantages." } @InProceedings{Couto:2006:CSC, author = "Couto, Thierson and Cristo, Marco and Goncalves, Marcos Andre and Calado, Pavel and Ziviani, Nivio and Moura, Edleno and Ribeiro-Neto, Berthier", title = "A comparative study of citations and links in document classification", booktitle = "Proceedings of the 6th ACM/IEEE-CS joint conference on Digital libraries", year = "2006", pages = "75--84", abstract = "It is well known that links are an important source of information when dealing with Web collections. However, the question remains on whether the same techniques that are used on the Web can be applied to collections of documents containing citations between scientific papers. In this work we present a comparative study of digital library citations and Web links, in the context of automatic text classification. We show that there are in fact differences between citations and links in this context. For the comparison, we run a series of experiments using a digital library of computer science papers and a Web directory. 
In our reference collections, measures based on co-citation tend to perform better for pages in the Web directory, with gains up to 37\% over text based classifiers, while measures based on bibliographic coupling perform better in a digital library. We also propose a simple and effective way of combining a traditional text based classifier with a citation-link based classifier. This combination is based on the notion of classifier reliability and presented gains of up to 14\% in micro-averaged F1 in the Web collection. However, no significant gain was obtained in the digital library. Finally, a user study was performed to further investigate the causes for these results. We discovered that misclassifications by the citation-link based classifiers are in fact difficult cases, hard to classify even for humans." } @InProceedings{Gliozzo:2005:DKT, author = {Gliozzo, Alfio and Strapparava, Carlo}, title = {Domain Kernels for Text Categorization}, booktitle = {Proceedings of the Ninth Conference on Computational Natural Language Learning (CoNLL-2005)}, month = {June}, year = {2005}, address = {Ann Arbor, Michigan}, publisher = {Association for Computational Linguistics}, pages = {56--63}, url = {http://www.aclweb.org/anthology/W/W05/W05-0608} } @InProceedings{Fukumoto:2004:CNA, author = "Fumiyo Fukumoto and Yoshimi Suzuki", title = "Comparison of Manual and Automatic Constructions of Category Hierarchy for Classifying Large Corpora", booktitle = "Proceedings of CoNLL-2004", year = "2004", pages = "65--72", abstract = "We address the problem dealing with a large collection of data, and investigate the use of automatically constructing category hierarchy from a given set of categories to improve classification of large corpora. We use two well-known techniques, partitioning clustering, means and a hierarchy. -means is to cluster the given categories in a hierarchy. 
To select the proper number of , we use assures the degree of our disappointment in any differences between the true distribution over inputs and the learner's prediction. Once the optimal number of is selected, for eac ter, the procedure is repeated. Our evaluation using the 1996 Reuters corpus which consists of 806,791 documents shows that automatically constructing hierarchy improves classification accuracy." } @InProceedings{Ginter:2005:DCU, author = "Filip Ginter and Sampo Pyysalo and Tapio Salakoski", title = "Document Classification Using Semantic Networks with An Adaptive Similarity Measure", booktitle = "Recent Advances in Natural Language Processing", year = "2005", month = "September", address = "Borovets, Bulgaria" } @InProceedings{Mihalcea:2005:UET, author = "Rada Mihalcea and Samer Hassan", title = "Using the Essence of Texts to Improve Document Classification", booktitle = "Recent Advances in Natural Language Processing", year = "2005", month = "September", address = "Borovets, Bulgaria" } @InProceedings{Guo:2004:KMB, author = "Gongde Guo and Hui Wang and David Bell and Yaxin Bi and Kieran Greer", title = "An kNN Model-based Approach and its Application in Text Categorization", booktitle = "Computational Linguistics and Intelligent Text Processing (Lecture Notes in Computer Science, Vol. 2945)", year = "2004" } @InProceedings{Wenliang:2004:ALF, author = "Chen Wenliang and Zhu Jingbo and Wu Honglin and Yao Tianshun", title = "Automatic Learning Features Using Bootstrapping for Text Categorization", booktitle = "Computational Linguistics and Intelligent Text Processing (Lecture Notes in Computer Science, Vol. 2945)", year = "2004" } @InProceedings{Kim:2004:RCR, author = "Sang-Bum Kim and Hae-Chang Rim", title = "Recomputation of Class Relevance Scores for Improving Text Classification", booktitle = "Computational Linguistics and Intelligent Text Processing (Lecture Notes in Computer Science, Vol. 
2945)", year = "2004" } @InProceedings{Xue:2004:RHD, author = "Dejun Xue and Maosong Sun", title = "Raising High-Degree Overlapped Character Bigrams into Trigrams for Dimensionality Reduction in Chinese Text Categorization", booktitle = "Computational Linguistics and Intelligent Text Processing (Lecture Notes in Computer Science, Vol. 2945)", year = "2004" } @InProceedings{Rosso:2004:IRT, author = "Paolo Rosso and Antonio Molina and Ferran Pla and Daniel Jimenez and Vicent Vidal", title = "Information Retrieval and Text Categorization with Semantic Indexing", booktitle = "Computational Linguistics and Intelligent Text Processing (Lecture Notes in Computer Science, Vol. 2945)", year = "2004" } @InProceedings{Schneider:2005:TIP, author = "Schneider, Karl-Michael", title = "Techniques for Improving the Performance of Naive Bayes for Text Classification", booktitle = "Computational Linguistics and Intelligent Text Processing", year = "2005" } @InProceedings{Pappuswamy:2005:SCM, author = "Pappuswamy, Umarani and Bhembe, Dumisizwe and Jordan, Pamela W. 
and VanLehn, Kurt", title = "A Supervised Clustering Method for Text Classification", booktitle = "Computational Linguistics and Intelligent Text Processing", year = "2005" } @InProceedings{Chowdhury:2005:UTC, author = "Nirmalya Chowdhury and Diganta Saha", title = "Unsupervised Text Classification using Kohonen’s Self Organizing Network", booktitle = "Computational Linguistics and Intelligent Text Processing", year = "2005" } @InProceedings{MoyotlHernandez:2005:EDF, author = "Edgar Moyotl-Hernandez and Hector Jimenez-Salaz", title = "Enhancement of DTP Feature Selection Method for Text Categorization", booktitle = "Computational Linguistics and Intelligent Text Processing", year = "2005" } @InProceedings{Xia:2005:FAE, author = "Yunqing Xia and Angelo Dalli and Yorick Wilks and Louise Guthrie", title = "{FASiL} Adaptive Email Categorization System", booktitle = "Computational Linguistics and Intelligent Text Processing", year = "2005" } @InProceedings{Anagnostopoulos:2006:EEC, author = "Aris Anagnostopoulos and Andrei Broder and Kunal Punera", title = "Effective and efficient classification on a search-engine model", booktitle = "CIKM", year = "2006" } @InProceedings{Qi:2006:KWP, author = "Xiaoguang Qi and Brian Davison", title = "Knowing a web page by the company it keeps", booktitle = "CIKM", year = "2006", pages = "228--237", } @InProceedings{Schuetze:2006:PTP, author = "Hinrich Schuetze and Emre Velipasaoglu and Jan Pedersen", title = "Performance thresholding in practical text classification", booktitle = "CIKM", year = "2006", pages = "662--671", } @InProceedings{Shen:2006:TCI, author = "Dou Shen and Jian-Tao Sun and Qiang Yang and Zheng Chen", title = "Text classification improved through multigram models", booktitle = "CIKM", year = "2006", pages = "672--681", } @InProceedings{Esuli:2006:DTS, author = "Andrea Esuli and Fabrizio Sebastiani", title = "Determining term subjectivity and term orientation for opinion mining", booktitle = "Proceedings of EACL-06, 
11th Conference of the European Chapter of the Association for Computational Linguistics", year = "2006", pages = "193--200", } @InProceedings{Sandler:2005:OUL, author = "Mark Sandler", title = "On the use of linear programming for unsupervised text classification", booktitle = "Proceedings of the eleventh ACM SIGKDD international conference on knowledge discovery in data mining", year = "2005", pages = "256--264", } @InProceedings{Zhang:2006:LPM, author = "Zhang, Tong and Popescul, Alexandrin and Dom, Byron", title = "Linear prediction models with graph regularization for web-page categorization", booktitle = "Proceedings of the twelfth ACM SIGKDD international conference on knowledge discovery and data mining", year = "2006", pages = "821--826", } @InProceedings{Forman:2006:QTA, author = "Forman, George", title = "Quantifying trends accurately despite classifier error and class imbalance", booktitle = "Proceedings of the twelfth ACM SIGKDD international conference on knowledge discovery and data mining", year = "2006", pages = "157--166", } @InProceedings{Joachims:2006:TLS, author = "Joachims, Thorsten", title = "Training linear {SVM}s in linear time", booktitle = "Proceedings of the twelfth ACM SIGKDD international conference on knowledge discovery and data mining", year = "2006", pages = "217--226", } @InProceedings{Kumar:2006:HTS, author = "Kumar, Ravi and Punera, Kunal and Tomkins, Andrew", title = "Hierarchical topic segmentation of websites", booktitle = "Proceedings of the twelfth ACM SIGKDD international conference on knowledge discovery and data mining", year = "2006", pages = "257--266", } @InProceedings{Zhang:2006:HTS, author = "Zhang, Dell and Lee, Wee Sun", title = "Extracting key-substring-group features for text classification", booktitle = "Proceedings of the twelfth ACM SIGKDD international conference on knowledge discovery and data mining", year = "2006", pages = "474--483", } @InProceedings{Hulth:2006:SAE, author = "Hulth, Anette and Megyesi, 
Beata B.", title = "A Study on Automatically Extracted Keywords in Text Categorization", booktitle = "Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the Association for Computational Linguistics", year = "2006" } @InProceedings{Li:2006:CSQ, author = "Li, Jingyang and Sun, Maosong and Zhang, Xian", title = "A Comparison and Semi-Quantitative Analysis of Words and Character-Bigrams as Features in Chinese Text Categorization", booktitle = "Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the Association for Computational Linguistics", year = "2006" } @InProceedings{Gliozzo:2006:ECC, author = "Gliozzo, Alfio and Strapparava, Carlo", title = "Exploiting Comparable Corpora and Bilingual Dictionaries for Cross-Language Text Categorization", booktitle = "Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the Association for Computational Linguistics", year = "2006", pages = "553--560" } @InProceedings{Lin:2006:ATD, author = "Wei-Hao Lin and Alexander Hauptmann", title = "Are These Documents Written from Different Perspectives? 
A Test of Different Perspectives Based On Statistical Distribution Divergence", booktitle = "Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the Association for Computational Linguistics", year = "2006" } @InProceedings{Wiebe:2006:WSS, author = "Janyce Wiebe and Rada Mihalcea", title = "Word Sense and Subjectivity", booktitle = "Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the Association for Computational Linguistics", year = "2006" } @Article{Fumera:2006:SFB, author = {Giorgio Fumera and Ignazio Pillai and Fabio Roli}, title = {Spam Filtering Based On The Analysis Of Text Information Embedded Into Images}, journal = {Journal of Machine Learning Research}, volume = {7}, pages = {2699--2720}, year = {2006} } @Article{Bratko:2006:SFU, author = {Bratko, Andrej and Filipic, Bogdan and Cormack, Gordon V. and Lynam, Thomas R. and Zupan, Blaz}, title = {Spam Filtering Using Statistical Data Compression Models}, journal = {Journal of Machine Learning Research}, volume = {7}, pages = {2673--2698}, year = {2006} } @Article{Bratko:2006:ESI, author = {Bratko, Andrej and Filipic, Bogdan}, title = {Exploiting Structural Information for Semi-structured Document Categorization}, journal = {Information Processing and Management}, volume = {42}, number = {3}, pages = {679--694}, year = {2006} } @InProceedings{Radovanovic:2006:DRC, author = {Milo\v{s} Radovanovi\'c and Mirjana Ivanovi\'c}, title = {Document Representations for Classification of Short {W}eb-page Descriptions}, booktitle = {Proceedings of DaWaK-06, 8th International Conference on Data Warehousing and Knowledge Discovery}, year = {2006}, pages = {544--553}, series = {Lecture Notes in Computer Science}, volume = {4081}, address = {Krakow, Poland}, publisher = {Springer-Verlag}, url = {http://perun.im.ns.ac.yu/radovanovic/publications/2006-dawak-docrep.pdf}, abstract = {Motivated by applying Text 
Categorization to sorting Web search results, this paper describes an extensive experimental study of the impact of bag-of-words document representations on the performance of five major classifiers -- Naive Bayes, SVM, Voted Perceptron, kNN and C4.5. The texts represent short Web-page descriptions from the dmoz Open Directory Web-page ontology. Different transformations of input data: stemming, normalization, logtf and idf, together with dimensionality reduction, are found to have a statistically significant improving or degrading effect on classification performance measured by classical metrics -- accuracy, precision, recall, F$_1$ and F$_2$. The emphasis of the study is not on determining the best document representation which corresponds to each classifier, but rather on describing the effects of every individual transformation on classification, together with their mutual relationships.} } @InProceedings{Radovanovic:2006:IBD, author = {Milo\v{s} Radovanovi\'c and Mirjana Ivanovi\'c}, title = {Interactions Between Document Representation and Feature Selection in Text Categorization}, booktitle = {Proceedings of DEXA-06, 17th International Conference on Database and Expert Systems Applications}, year = {2006}, pages = {489--498}, series = {Lecture Notes in Computer Science}, volume = {4080}, address = {Krakow, Poland}, publisher = {Springer-Verlag}, url = {http://perun.im.ns.ac.yu/radovanovic/publications/2006-dexa-idf.pdf}, abstract = {Many studies in automated Text Categorization focus on the performance of classifiers, with or without considering feature selection methods, but almost as a rule taking into account just one document representation. Only relatively recently did detailed studies on the impact of various document representations step into the spotlight, showing that there may be statistically significant differences in classifier performance even among variations of the classical bag-of-words model. 
This paper examines the relationship between the idf transform and several widely used feature selection methods, in the context of Naive Bayes and Support Vector Machines classifiers, on datasets extracted from the dmoz ontology of Web-page descriptions. The described experimental study shows that the idf transform considerably effects the distribution of classification performance over feature selection reduction rates, and offers an evaluation method which permits the discovery of relationships between different document representations and feature selection methods which is independent of absolute differences in classification performance.} } @InProceedings{Radovanovic:2006:CCM, author = {Milo\v{s} Radovanovi\'c and Mirjana Ivanovi\'c}, title = {Cat{S}: A Classification-Powered Meta-Search Engine}, booktitle = {Advances in Web Intelligence and Data Mining}, year = {2006}, editor = {Last, M. and Szczepaniak, P. S. and Volkovich, Z. and Kandel, A.}, pages = {191--200}, series = {Studies in Computational Intelligence}, volume = {23}, publisher = {Springer-Verlag}, url = {http://perun.im.ns.ac.yu/radovanovic/publications/2006-awic-cats.pdf}, abstract = {CatS is a meta-search engine that utilizes text classification techniques to improve the presentation of search results. After posting a query, the user is offered an opportunity to refine the results by browsing through a category tree derived from the dmoz Open Directory topic hierarchy. This paper describes some key aspects of the system (including HTML parsing, classification and displaying of results), outlines the text categorization experiments performed in order to choose the right parameters for classification, and puts the system into the context of related work on (meta-)search engines. 
The approach of using a separate category tree represents an extension of the standard relevance list, and provides a way to refine the search on need, offering the user a non-imposing, but potentially powerful tool for locating needed information quickly and efficiently. The current implementation of CatS may be considered a baseline, on top of which many enhancements are possible.} } @InProceedings{Kibriya:2004:MNB, author = {Ashraf M. Kibriya and Eibe Frank and Bernhard Pfahringer and Geoffrey Holmes}, title = {Multinomial Naive Bayes for Text Categorization Revisited}, booktitle = {Proceedings of AI-04, 17th Australian Joint Conference on Artificial Intelligence}, year = {2004}, pages = {488--499}, address = {Cairns, Australia}, series = {Lecture Notes in Artificial Intelligence}, volume = {3339}, publisher = {Springer-Verlag}, url = {http://www.cs.waikato.ac.nz/~eibe/pubs/kibriya_et_al_cr.ps.gz}, abstract = {This paper presents empirical results for several versions of the multinomial naive Bayes classifier on four text categorization problems, and a way of improving it using locally weighted learning. More specifically, it compares standard multinomial naive Bayes to the recently proposed transformed weight-normalized complement naive Bayes classifier (TWCNB) [1], and shows that some of the modifications included in TWCNB may not be necessary to achieve optimum performance on some datasets. However, it does show that TFIDF conversion and document length normalization are important. It also shows that support vector machines can, in fact, sometimes very significantly outperform both methods. Finally, it shows how the performance of multinomial naive Bayes can be improved using locally weighted learning. However, the overall conclusion of our paper is that support vector machines are still the method of choice if the aim is to maximize accuracy.} } @InProceedings{Makrehchi:2005:TCU, author = {Masoud Makrehchi and Mohamed S. 
Kamel}, title = {Text Classification Using Small Number of Features}, booktitle = {Proceedings of MLDM-05, 4th International Conference on Machine Learning and Data Mining in Pattern Recognition}, year = {2005}, pages = {580--589}, address = {Leipzig, Germany}, series = {Lecture Notes in Artificial Intelligence}, volume = {3587}, publisher = {Springer-Verlag}, url = {http://www.springerlink.com/content/4ytxvxmjea83ctqv/}, abstract = {Feature selection method for text classification based on information gain ranking, improved by removing redundant terms using mutual information measure and inclusion index, is proposed. We report an experiment to study the impact of term redundancy on the performance of text classifier. The result shows that term redundancy behaves very similar to noise and may degrade the classifier performance. The proposed method is tested on an SVM text classifier. Feature reduction by this method remarkably outperforms information gain based feature selection.} } @InProceedings{Kules:2006:CWS, author = "Kules, Bill and Kustanowitz, Jack and Scneiderman, Ben", title = "Categorizing web search results into meaningful and stable categories using fast-feature techniques", booktitle = "Proceedings of the 6th ACM/IEEE-CS joint conference on Digital libraries", year = "2006", pages = "210--219", abstract = "When search results against digital libraries and web resources have limited metadata, augmenting them with meaningful and stable category information can enable better overviews and support user exploration. This paper proposes six fast-feature techniques that use only features available in the search result list, such as title, snippet, and URL, to categorize results into meaningful categories. They use credible knowledge resources, including a US government organizational hierarchy, a thematic hierarchy from the Open Directory Project (ODP) web directory, and personal browse histories, to add valuable metadata to search results. 
In three tests the percent of results categorized for five representative queries was high enough to suggest practical benefits: general web search (76-90\%), government web search (39-100\%), and the Bureau of Labor Statistics website (48-94\%). An additional test submitted 250 TREC queries to a search engine and successfully categorized 66\% of the top 100 using the ODP and 61\% of the top 350. Fast-feature techniques have been implemented in a prototype search engine. We propose research directions to improve categorization rates and make suggestions about how web site designers could re-organize their sites to support fast categorization of search results." } @InProceedings{Wang:2005:WOI, author = "Wang, Muyuan and Li, Zhiwei and Lu, Lie and Ma, Wei-Ying and Zhang, Naiyao", title = "Web object indexing using domain knowledge", booktitle = "Proceedings of the eleventh ACM SIGKDD international conference on knowledge discovery in data mining", year = "2005", pages = "294--303", abstract = "A web object is defined to represent any meaningful object embedded in web pages (e.g. images, music) or pointed to by hyperlinks (e.g. downloadable files). In many cases, users would like to search for information of a certain 'object', rather than a web page containing the query terms. To facilitate web object searching and organizing, in this paper, we propose a novel approach to web object indexing, by discovering its inherent structure information with existed domain knowledge. In our approach, first, Layered LSI spaces are built for a better representation of the hierarchically structured domain knowledge, in order to emphasize the specific semantics and term space in each layer of the domain knowledge. Meanwhile, the web object representation is constructed by hyperlink analysis, and further pruned to remove the noises. 
Then an optimal matching between the web object and the domain knowledge is performed, in order to pick out the structure attributes of the web object from the knowledge. Finally, the obtained structure attributes are used to re-organize and index the web objects. Our approach also indicates a new promising way to use trust-worthy Deep Web knowledge to help organize dispersive information of Surface Web." } @InProceedings{Kolcz:2007:AMR, author = "Kolcz, Aleksander and Chowdhury, Abdur", title = "Avoidance of Model Re-Induction in SVM-based Feature Selection for Text Categorization", booktitle = "Proceedings of the International Joint Conference on Artificial Intelligence", year = "2007", pages = "889--894", address = "Hyderabad, India" } @InProceedings{Dai:2007:TNB, author = "Wenyuan Dai and Gui-Rong Xue and Qiang Yang and Yong Yu", title = "Transferring Naive Bayes Classifiers for Text Classification", booktitle = "Proceedings of the Twenty-Second AAAI Conference on Artificial Intelligence", year = "2007", month = "July", address = "Vancouver, British Columbia, Canada" } @InProceedings{Betts:2007:UIE, author = "Betts, Tom and Milosavljevic, Maria and Oberlander, Jon", title = "The Utility of Information Extraction in the Classification of Books", booktitle = "Proceedings of the 29th European Conference on Information Retrieval", year = "2007", month = "April", address = "Rome, Italy" } @InProceedings{Bloehdorn:2007:CSS, author = "Stephan Bloehdorn and Alessandro Moschitti", title = "Combined Syntactic and Semanitc Kernels for Text Classification", booktitle = "Proceedings of the 29th European Conference on Information Retrieval", year = "2007", month = "April", address = "Rome, Italy" } @InProceedings{Davy:2007:ALH, author = "Michael Davy and Saturnino Luz", title = "Active Learning with History-Based Query Selection for Text Categorisation", booktitle = "Proceedings of the 29th European Conference on Information Retrieval", year = "2007", month = "April", 
address = "Rome, Italy" } @InProceedings{He:2007:INB, author = "Feng He and Xiaoqing Ding", title = "Improving Naive Bayes Text Classifier Using Smoothing Methods", booktitle = "Proceedings of the 29th European Conference on Information Retrieval", year = "2007", month = "April", address = "Rome, Italy" } @InProceedings{deMelo:2007:MTC, author = "de Melo, Gerard and Siersdorfer, Stefan", title = "Multilingual Text Classification using Ontologies", booktitle = "Proceedings of the 29th European Conference on Information Retrieval", year = "2007", month = "April", address = "Rome, Italy" } @InProceedings{Li:2007:STS, author = "Jingyang Li and Maosong Sun", title = "Scalable Term Selection for Text Categorization", booktitle = "Proceedings of the 29th European Conference on Information Retrieval", year = "2007", month = "June", address = "Prague, Czech Republic" } @InProceedings{Kim:2007:CAP, author = "Soo-Min Kim and Eduard Hovy", title = "Crystal: Analyzing Predictive Opinions on the Web", booktitle = "Proceedings of the Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning", year = "2007", month = "June", address = "Prague, Czech Republic" } @InProceedings{Blitzer:2007:BBB, author = "John Blitzer and Mark Dredze and Fernando Pereira", title = "Biographies, Bollywood, Boom-boxes and Blenders: Domain Adaptation for Sentiment Classification", booktitle = "Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics", year = "2007", month = "June", address = "Prague, Czech Republic" } @InProceedings{Li:2007:ECI, author = "Jingyang Li and Maosong Sun", title = "Exploiting Category Information and Document Information to Improve Term Weighting for Text Categorization", booktitle = "Proceedings of the Eighth International Conference on Intelligent Text Processing and Computational Linguistics", year = "2007", month = "February", address = "Mexico City, Mexico" } 
% 2007 entries: CICLing and NAACL papers, followed by journal articles.
% Acronyms and proper nouns in titles are brace-protected so that styles
% which sentence-case titles do not lowercase them.

@InProceedings{Cleuziou:2007:OLL,
  author = "Guillaume Cleuziou and Celine Poudat",
  title = "On the impact of Lexical and Linguistic features in Genre and Domain-Based Text Categorization",
  booktitle = "Proceedings of the Eighth International Conference on Intelligent Text Processing and Computational Linguistics",
  year = "2007",
  month = "February",
  address = "Mexico City, Mexico"
}

@InProceedings{Karakos:2007:TJR,
  author = "Karakos, Damianos and Eisner, Jason and Khudanpur, Sanjeev and Priebe, Carey E.",
  title = "Tuning {Jensen-Renyi} Divergences with Statistically Similar Examples for Unsupervised Document Categorization via Iterative Denoising Trees",
  booktitle = "Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics",
  year = "2007",
  month = "April",
  address = "Rochester, NY"
}

@InProceedings{Zaidan:2007:UAR,
  author = "Omar Zaidan and Jason Eisner and Christine Piatko",
  title = "Using ``Annotator Rationales'' to Improve Machine Learning for Text Categorization",
  booktitle = "Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics",
  year = "2007",
  month = "April",
  address = "Rochester, NY"
}

@Article{Zelikovitz:2007:EWB,
  author = {Zelikovitz, Sarah and Cohen, William W. and Hirsh, Haym},
  title = {Extending {WHIRL} with background knowledge for improved text classification},
  journal = {Information Retrieval},
  volume = {10},
  number = {1},
  pages = {35--67},
  year = {2007},
  month = {January}
}

@Article{Serrano:2007:ELD,
  author = {Serrano, J. I. and del Castillo, M. D.},
  title = {Evolutionary learning of document categories},
  journal = {Information Retrieval},
  volume = {10},
  number = {1},
  pages = {69--83},
  year = {2007},
  month = {January}
}

@Article{Ceci:2007:CWD,
  author = {Michelangelo Ceci and Donato Malerba},
  title = {Classifying web documents in a hierarchy of categories: a comprehensive study},
  journal = {Journal of Intelligent Information Systems},
  volume = {28},
  number = {1},
  pages = {37--78},
  year = {2007},
  month = {February}
}

@Article{Koppel:2007:MDU,
  author = {Koppel, Moshe and Schler, Jonathan and Bonchek-Dokow, Elisheva},
  title = {Measuring Differentiability: Unmasking Pseudonymous Authors},
  journal = {Journal of Machine Learning Research},
  volume = {8},
  pages = {1261--1276},
  year = {2007},
  month = {June}
}