Automatically assigned DDC number: 006312

Manually assigned DDC number: 006312

Number of references: 0

Title: Efficient Text Categorization

Author:

Subject: Marko Grobelnik Efficient Text Categorization

Description: We present an approach to text categorization using machine learning techniques. The approach is developed and tested on a large text hierarchy named Yahoo that is available on the Web. We handle the large number of features and training examples by taking into account the hierarchical structure of examples and using feature subset selection for large text data. The large number of categories is handled separately for each testing example by pruning unpromising categories. In this way, the number of categories to be considered is cut to less than half without degrading the system performance. Our experiments are performed using a naive Bayesian classifier on text data using a feature-vector document representation that includes n-grams instead of just single words (unigrams). Experimental evaluation on three domains constructed from the Yahoo hierarchy shows that among several hundred categories the correct category is assigned probability over 0.99 when a rather small number of features is used. 1 Int...

Contributor: The Pennsylvania State University CiteSeer Archives

Publisher: unknown

Date: 1998-03-04

Pubyear: 1998

Format: ps

Identifier: http://citeseer.ist.psu.edu/140838.html

Source: http://www.cs.cmu.edu/~TextLearning/pww/papers/PWW/pwwWshECML98.ps.gz

Language: en

Rights: unrestricted

Graph

<?xml   version="1.0"   encoding="UTF-8"?>

<references_metadata>

      <rec   ID="SELF"   Type="SELF"   CiteSeer_Book="SELF"   CiteSeer_Volume="SELF"   Title="Efficient   Text   Categorization">

            <identifier   Org="ISBN:1402040423"   Paper_ID="SELF"   Extracted="1402040423"   DDC="620.8/2"   Normalized_DDC="62082"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:1581131461"   Paper_ID="SELF"   Extracted="1581131461"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:1581137230"   Paper_ID="SELF"   Extracted="1581137230"   />

            <identifier   Org="ISBN:1586037749"   Paper_ID="SELF"   Extracted="1586037749"   DDC="610.285"   Normalized_DDC="610285"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:1591400511"   Paper_ID="SELF"   Extracted="1591400511"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:1853128066"   Paper_ID="SELF"   Extracted="1853128066"   DDC="006.3/12"   Normalized_DDC="006312"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540201807"   Paper_ID="SELF"   Extracted="3540201807"   DDC="004.67/8"   Normalized_DDC="004678"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540209719"   Paper_ID="SELF"   Extracted="3540209719"   DDC="025.04"   Normalized_DDC="02504"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540232583"   Paper_ID="SELF"   Extracted="3540232583"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540283129"   Paper_ID="SELF"   Extracted="3540283129"   DDC="006.33"   Normalized_DDC="00633"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540334726"   Paper_ID="SELF"   Extracted="3540334726"   DDC="025.04"   Normalized_DDC="02504"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540408134"   Paper_ID="SELF"   Extracted="3540408134"   DDC="519.5"   Normalized_DDC="5195"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:354041066X"   Paper_ID="SELF"   Extracted="354041066X"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540437819"   Paper_ID="SELF"   Extracted="3540437819"   DDC="670/.285/63"   Normalized_DDC="67028563"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540440380"   Paper_ID="SELF"   Extracted="3540440380"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07142857142857142"   />

      </rec>

</references_metadata>

www.000webhost.com