Automatically assigned DDC number: 006312

Manually assigned DDC number: 006312

Number of references: 8

Title: Exploiting Structural Information for Text Classification on the WWW

Author:

Subject: Johannes Fürnkranz Exploiting Structural Information for Text Classification on the WWW

Description: In this paper, we report on a set of experiments that explore the utility of making use of the structural information of WWW documents. Our working hypothesis is that it is often easier to classify a hypertext page using information provided on pages that point to it instead of using information that is provided on the page itself. We present experimental evidence that confirms this hypothesis on a set of Web-pages that relate to Computer Science Departments. 1 Introduction The advent of the World-Wide Web has rejuvenated the interest in text categorization problems. Vast amounts of documents are available on-line, and categorizing them into meaningful semantic categories is a rewarding and challenging research problem. However, current approaches to text categorization on the Web mostly concentrate on simple representation schemes that are based on word occurrence and word frequency. The structural information that is inherent to documents on the Web is often neglected. There are a...

Contributor: The Pennsylvania State University CiteSeer Archives

Publisher: unknown

Date: 1999-05-05

Pubyear: 1999

Format: ps

Identifier: http://citeseer.ist.psu.edu/173894.html

Source: http://www.ai.univie.ac.at/~juffi/publications/ida-99.ps.gz

Language: en

Relation:

Relation:

Relation:

Relation:

Relation:

Relation:

Relation:

Relation:

Rights: unrestricted

Graph

<?xml   version="1.0"   encoding="UTF-8"?>

<references_metadata>

      <rec   ID="/54411.html"   Type="inproceedings"   CiteSeer_Book="Proc   of   the   12th   International   Conference   on   Machine   Learning"   CiteSeer_Volume=""   Title="Fast   Effective   Rule   Induction,">

            <identifier   Org="ISBN:0387333339"   Paper_ID="/54411.html"   Extracted="0387333339"   DDC="006.312"   Normalized_DDC="006312"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:0387699341"   Paper_ID="/54411.html"   Extracted="0387699341"   DDC="006.312"   Normalized_DDC="006312"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:0898715458"   Paper_ID="/54411.html"   Extracted="0898715458"   DDC="006.3/12"   Normalized_DDC="006312"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:0898715938"   Paper_ID="/54411.html"   Extracted="0898715938"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:1558603778"   Paper_ID="/54411.html"   Extracted="1558603778"   DDC="006.3/1"   Normalized_DDC="00631"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:1591404509"   Paper_ID="/54411.html"   Extracted="1591404509"   DDC="006.3/3"   Normalized_DDC="00633"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:1605660108"   Paper_ID="/54411.html"   Extracted="1605660108"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540222189"   Paper_ID="/54411.html"   Extracted="3540222189"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540231056"   Paper_ID="/54411.html"   Extracted="3540231056"   DDC="006.3/1"   Normalized_DDC="00631"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540236627"   Paper_ID="/54411.html"   Extracted="3540236627"   DDC="005.75/8"   Normalized_DDC="005758"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540265430"   Paper_ID="/54411.html"   Extracted="3540265430"   DDC="006.3/12"   Normalized_DDC="006312"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540287957"   Paper_ID="/54411.html"   Extracted="3540287957"   DDC="519.5"   Normalized_DDC="5195"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540403000"   Paper_ID="/54411.html"   Extracted="3540403000"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540408134"   Paper_ID="/54411.html"   Extracted="3540408134"   DDC="519.5"   Normalized_DDC="5195"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540425381"   Paper_ID="/54411.html"   Extracted="3540425381"   DDC="005.1/15"   Normalized_DDC="005115"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540430601"   Paper_ID="/54411.html"   Extracted="3540430601"   DDC="519.5"   Normalized_DDC="5195"   Normalized_Weight="0.058823529411764705"   />

            <identifier   Org="ISBN:3540681248"   Paper_ID="/54411.html"   Extracted="3540681248"   />

            <identifier   Org="ISBN:3540695729"   Paper_ID="/54411.html"   Extracted="3540695729"   />

            <identifier   Org="ISBN:3540753893"   Paper_ID="/54411.html"   Extracted="3540753893"   DDC="006.31"   Normalized_DDC="00631"   Normalized_Weight="0.058823529411764705"   />

      </rec>

      <rec   ID="/56507.html"   Type="inproceedings"   CiteSeer_Book="AAAI/IAAI   Vol   1"   CiteSeer_Volume=""   Title="Learning   Trees   and   Rules   with   Set-Valued   Features,">

            <identifier   Org="ISBN:0124438814"   Paper_ID="/56507.html"   Extracted="0124438814"   DDC="006.3/3"   Normalized_DDC="00633"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:0262510987"   Paper_ID="/56507.html"   Extracted="0262510987"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:0769510469"   Paper_ID="/56507.html"   Extracted="0769510469"   />

            <identifier   Org="ISBN:1558607072"   Paper_ID="/56507.html"   Extracted="1558607072"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:1558608699"   Paper_ID="/56507.html"   Extracted="1558608699"   />

            <identifier   Org="ISBN:1581137230"   Paper_ID="/56507.html"   Extracted="1581137230"   />

            <identifier   Org="ISBN:3540263195"   Paper_ID="/56507.html"   Extracted="3540263195"   DDC="006.32"   Normalized_DDC="00632"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540281770"   Paper_ID="/56507.html"   Extracted="3540281770"   DDC="005.1/15"   Normalized_DDC="005115"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540292306"   Paper_ID="/56507.html"   Extracted="3540292306"   DDC="501"   Normalized_DDC="501"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540298495"   Paper_ID="/56507.html"   Extracted="3540298495"   DDC="621.392"   Normalized_DDC="621392"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540404333"   Paper_ID="/56507.html"   Extracted="3540404333"   DDC="006.3/33"   Normalized_DDC="006333"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540418261"   Paper_ID="/56507.html"   Extracted="3540418261"   DDC="025.04"   Normalized_DDC="02504"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540420274"   Paper_ID="/56507.html"   Extracted="3540420274"   DDC="006.3/1"   Normalized_DDC="00631"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540633464"   Paper_ID="/56507.html"   Extracted="3540633464"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540647384"   Paper_ID="/56507.html"   Extracted="3540647384"   DDC="005.1/15"   Normalized_DDC="005115"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540663320"   Paper_ID="/56507.html"   Extracted="3540663320"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540672273"   Paper_ID="/56507.html"   Extracted="3540672273"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.07142857142857142"   />

            <identifier   Org="ISBN:3540717021"   Paper_ID="/56507.html"   Extracted="3540717021"   />

            <identifier   Org="ISBN:9042012579"   Paper_ID="/56507.html"   Extracted="9042012579"   />

      </rec>

      <rec   ID="/90349.html"   Type="misc"   CiteSeer_Book=""   CiteSeer_Volume=""   Title="Using   statistical   and   relational   methods   to   characterize   hyperlink   paths,">

            <identifier   Org="ISBN:3540663320"   Paper_ID="/90349.html"   Extracted="3540663320"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="1.0"   />

      </rec>

      <rec   ID="/124233.html"   Type="inproceedings"   CiteSeer_Book="Proceedings   of   AAAI98   15th   Conference   of   the   American   Association   for   Artificial   Intelligence"   CiteSeer_Volume=""   Title="Learning   to   extract   symbolic   knowledge   from   the   {W}orld   {W}ide   {W}eb,"   />

      <rec   ID="/156523.html"   Type="article"   CiteSeer_Book="The   AI   Magazine"   CiteSeer_Volume="18"   Title="Machine-Learning   Research:   Four   Current   Directions,">

            <identifier   Org="ISBN:0262012111"   Paper_ID="/156523.html"   Extracted="0262012111"   DDC="006.3/1"   Normalized_DDC="00631"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:0470116625"   Paper_ID="/156523.html"   Extracted="0470116625"   DDC="572.80285/61"   Normalized_DDC="5728028561"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:1402073887"   Paper_ID="/156523.html"   Extracted="1402073887"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:1577350030"   Paper_ID="/156523.html"   Extracted="1577350030"   DDC="658.5"   Normalized_DDC="6585"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540205896"   Paper_ID="/156523.html"   Extracted="3540205896"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540231056"   Paper_ID="/156523.html"   Extracted="3540231056"   DDC="006.3/1"   Normalized_DDC="00631"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540306765"   Paper_ID="/156523.html"   Extracted="3540306765"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540366679"   Paper_ID="/156523.html"   Extracted="3540366679"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540404082"   Paper_ID="/156523.html"   Extracted="3540404082"   DDC="006.3/2"   Normalized_DDC="00632"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:354041066X"   Paper_ID="/156523.html"   Extracted="354041066X"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540425535"   Paper_ID="/156523.html"   Extracted="3540425535"   DDC="658.4/038/0285574"   Normalized_DDC="65840380285574"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540425551"   Paper_ID="/156523.html"   Extracted="3540425551"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540659072"   Paper_ID="/156523.html"   Extracted="3540659072"   DDC="003/.3"   Normalized_DDC="0033"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:354065965X"   Paper_ID="/156523.html"   Extracted="354065965X"   DDC="006.3/3"   Normalized_DDC="00633"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540665587"   Paper_ID="/156523.html"   Extracted="3540665587"   DDC="025/.00285"   Normalized_DDC="02500285"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540665994"   Paper_ID="/156523.html"   Extracted="3540665994"   DDC="006.3/1"   Normalized_DDC="00631"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540679774"   Paper_ID="/156523.html"   Extracted="3540679774"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540679936"   Paper_ID="/156523.html"   Extracted="3540679936"   DDC="510   s"   Normalized_DDC="51"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:3540728465"   Paper_ID="/156523.html"   Extracted="3540728465"   DDC="006.4/2"   Normalized_DDC="00642"   Normalized_Weight="0.05"   />

            <identifier   Org="ISBN:9051994753"   Paper_ID="/156523.html"   Extracted="9051994753"   DDC="621.36/7"   Normalized_DDC="621367"   Normalized_Weight="0.05"   />

      </rec>

      <rec   ID="/174218.html"   Type="Article"   CiteSeer_Book="Machine   Learning"   CiteSeer_Volume="27"   Title="Pruning   Algorithms   for   Rule   Learning,">

            <identifier   Org="ISBN:0769506259"   Paper_ID="/174218.html"   Extracted="0769506259"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:0898715687"   Paper_ID="/174218.html"   Extracted="0898715687"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:1402043775"   Paper_ID="/174218.html"   Extracted="1402043775"   />

            <identifier   Org="ISBN:155860586X"   Paper_ID="/174218.html"   Extracted="155860586X"   />

            <identifier   Org="ISBN:1558607781"   Paper_ID="/174218.html"   Extracted="1558607781"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:1586034529"   Paper_ID="/174218.html"   Extracted="1586034529"   />

            <identifier   Org="ISBN:3540001700"   Paper_ID="/174218.html"   Extracted="3540001700"   DDC="005.1"   Normalized_DDC="0051"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:3540005676"   Paper_ID="/174218.html"   Extracted="3540005676"   DDC="005.1/15"   Normalized_DDC="005115"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:3540219374"   Paper_ID="/174218.html"   Extracted="3540219374"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:3540231056"   Paper_ID="/174218.html"   Extracted="3540231056"   DDC="006.3/1"   Normalized_DDC="00631"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:3540419101"   Paper_ID="/174218.html"   Extracted="3540419101"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:3540628584"   Paper_ID="/174218.html"   Extracted="3540628584"   DDC="006.3/1"   Normalized_DDC="00631"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:3540663320"   Paper_ID="/174218.html"   Extracted="3540663320"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:3540714405"   Paper_ID="/174218.html"   Extracted="3540714405"   />

            <identifier   Org="ISBN:3540859276"   Paper_ID="/174218.html"   Extracted="3540859276"   />

            <identifier   Org="ISBN:9810246846"   Paper_ID="/174218.html"   Extracted="9810246846"   DDC="621.399"   Normalized_DDC="621399"   Normalized_Weight="0.08333333333333333"   />

            <identifier   Org="ISBN:9810247532"   Paper_ID="/174218.html"   Extracted="9810247532"   DDC="658.4/033"   Normalized_DDC="6584033"   Normalized_Weight="0.08333333333333333"   />

      </rec>

      <rec   ID="/153148.html"   Type="misc"   CiteSeer_Book=""   CiteSeer_Volume=""   Title="Using   Links   for   Classifying   Web-pages,">

            <identifier   Org="ISBN:1581137044"   Paper_ID="/153148.html"   Extracted="1581137044"   DDC="006.7"   Normalized_DDC="0067"   Normalized_Weight="0.3333333333333333"   />

            <identifier   Org="ISBN:3540408088"   Paper_ID="/153148.html"   Extracted="3540408088"   DDC="381/.142/028558"   Normalized_DDC="381142028558"   Normalized_Weight="0.3333333333333333"   />

            <identifier   Org="ISBN:3540663320"   Paper_ID="/153148.html"   Extracted="3540663320"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.3333333333333333"   />

      </rec>

      <rec   ID="/9707.html"   Type="misc"   CiteSeer_Book=""   CiteSeer_Volume=""   Title="A   Case   Study   in   Using   Linguistic   Phrases   for   Text   Categorization   on   the   {WWW},">

            <identifier   Org="ISBN:0387244352"   Paper_ID="/9707.html"   Extracted="0387244352"   DDC="006.3/12"   Normalized_DDC="006312"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:0769507107"   Paper_ID="/9707.html"   Extracted="0769507107"   />

            <identifier   Org="ISBN:0792373499"   Paper_ID="/9707.html"   Extracted="0792373499"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:0792376560"   Paper_ID="/9707.html"   Extracted="0792376560"   DDC="005.2/76"   Normalized_DDC="005276"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:079237679X"   Paper_ID="/9707.html"   Extracted="079237679X"   DDC="005"   Normalized_DDC="005"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:0826491812"   Paper_ID="/9707.html"   Extracted="0826491812"   DDC="418/.020285"   Normalized_DDC="418020285"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:158113231X"   Paper_ID="/9707.html"   Extracted="158113231X"   />

            <identifier   Org="ISBN:1600217001"   Paper_ID="/9707.html"   Extracted="1600217001"   DDC="401/.410285"   Normalized_DDC="401410285"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:1845640179"   Paper_ID="/9707.html"   Extracted="1845640179"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:3540211233"   Paper_ID="/9707.html"   Extracted="3540211233"   DDC="006.3/2"   Normalized_DDC="00632"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:3540213317"   Paper_ID="/9707.html"   Extracted="3540213317"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:3540213821"   Paper_ID="/9707.html"   Extracted="3540213821"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:3540425365"   Paper_ID="/9707.html"   Extracted="3540425365"   DDC="006.3/1"   Normalized_DDC="00631"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:3540663320"   Paper_ID="/9707.html"   Extracted="3540663320"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.07692307692307693"   />

            <identifier   Org="ISBN:3540673059"   Paper_ID="/9707.html"   Extracted="3540673059"   DDC="006.3/2"   Normalized_DDC="00632"   Normalized_Weight="0.07692307692307693"   />

      </rec>

      <rec   ID="SELF"   Type="SELF"   CiteSeer_Book="SELF"   CiteSeer_Volume="SELF"   Title="Exploiting   Structural   Information   for   Text   Classification   on   the   WWW">

            <identifier   Org="ISBN:0521836573"   Paper_ID="SELF"   Extracted="0521836573"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:076374137X"   Paper_ID="SELF"   Extracted="076374137X"   DDC="006.3/3"   Normalized_DDC="00633"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:1402037678"   Paper_ID="SELF"   Extracted="1402037678"   DDC="025.04"   Normalized_DDC="02504"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:1402081502"   Paper_ID="SELF"   Extracted="1402081502"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:1581135939"   Paper_ID="SELF"   Extracted="1581135939"   />

            <identifier   Org="ISBN:1581137230"   Paper_ID="SELF"   Extracted="1581137230"   />

            <identifier   Org="ISBN:1595931406"   Paper_ID="SELF"   Extracted="1595931406"   />

            <identifier   Org="ISBN:3540201777"   Paper_ID="SELF"   Extracted="3540201777"   DDC="005.74"   Normalized_DDC="00574"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:3540250573"   Paper_ID="SELF"   Extracted="3540250573"   DDC="006.312"   Normalized_DDC="006312"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:3540315888"   Paper_ID="SELF"   Extracted="3540315888"   DDC="006.7"   Normalized_DDC="0067"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:354040726X"   Paper_ID="SELF"   Extracted="354040726X"   DDC="025/.00285"   Normalized_DDC="02500285"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:3540663320"   Paper_ID="SELF"   Extracted="3540663320"   DDC="006.3"   Normalized_DDC="0063"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:3540691367"   Paper_ID="SELF"   Extracted="3540691367"   DDC="006.33"   Normalized_DDC="00633"   Normalized_Weight="0.1"   />

            <identifier   Org="ISBN:3540733442"   Paper_ID="SELF"   Extracted="3540733442"   />

      </rec>

</references_metadata>

www.000webhost.com