%!PS-Adobe-3.0
%%Title: (Microsoft Word - Witten Bray Mahoui Teahan 1999)
%%Creator: (Microsoft Word: LaserWriter 8 Z1-8.5.1)
%%CreationDate: (12:41 PM Monday, 4 October 1999)
%%For: (Bronwyn)
%%Routing: (mailto:\000bronwyn@cs.waikato.ac.nz)
%%Pages: 1
%%DocumentFonts: TimesNewRomanPSMT TimesNewRomanPS-BoldMT Arial-BoldMT TimesNewRomanPS-ItalicMT
%%DocumentNeededFonts: TimesNewRomanPSMT TimesNewRomanPS-BoldMT Arial-BoldMT TimesNewRomanPS-ItalicMT
%%DocumentSuppliedFonts:
%%DocumentData: Clean7Bit
%%PageOrder: Ascend
%%Orientation: Portrait
%%DocumentMedia: Default 595 842 0 () ()
%ADO_ImageableArea: 17 22 577 819
%%EndComments
%%BeginDefaults
%%ViewingOrientation: 1 0 0 1
%%EndDefaults
% NOTE: a PostScript comment runs from `%` to the end of the line, so every
% DSC comment above and below must sit on a line of its own; otherwise the
% executable code that follows it on the same line is silently skipped.
%
% Job information: a 5-entry dictionary stored in userdict under /dscInfo,
% repeating the Title/Creator/CreationDate/For/Pages values of the header.
userdict/dscInfo 5 dict dup begin
/Title(Microsoft Word - Witten Bray Mahoui Teahan 1999)def
/Creator(Microsoft Word: LaserWriter 8 Z1-8.5.1)def
/CreationDate(12:41 PM Monday, 4 October 1999)def
/For(Bronwyn)def
/Pages 1 def
end put
% Create the driver's working dictionary `md` (capacity 188) and push it on
% the dictionary stack.  It is left open here; the matching `end` appears
% later in the file, just before %%EndProlog.
/md 188 dict def md begin
% Where the interpreter has `currentpacking`, remember the current packing
% mode in sc_oldpacking and switch array packing on.
/currentpacking where {pop /sc_oldpacking currentpacking def true setpacking}if
%%BeginFile: lw8_basic-2.3
%%Copyright: Copyright 1990-1997 Adobe Systems Incorporated and Apple Computer Incorporated. All Rights Reserved.
% ---- lw8_basic: definition shorthands -------------------------------------
% One definition per line; DSC comments (%%...) are kept on lines of their
% own because a `%` comment extends to end-of-line and would otherwise
% comment out the code that follows it.
%
% bd   : bind a procedure and define it        xdf : value key-order swap, def
% xs   : like xdf but uses store               ld  : define key as `load` of name
% Z    : define key as 0
/bd{bind def}bind def
/xdf{exch def}bd
/xs{exch store}bd
/ld{load def}bd
/Z{0 def}bd
% Fourteen alias/target name pairs, consumed from the top of the stack
% downwards by `14{ld}repeat`.
/T/true
/F/false
/:L/lineto
/lw/setlinewidth
/:M/moveto
/rl/rlineto
/rm/rmoveto
/:C/curveto
/:T/translate
/:K/closepath
/:mf/makefont
/gS/gsave
/gR/grestore
/np/newpath
14{ld}repeat
/framewidth -1 def
/QDframwid -1 def
/numframes Z
/mTS matrix def
/$m matrix def
/av 85 def
/por T def
/normland F def
/psb-nosave{}def
/pse-nosave{}def
/us Z
% psb / pse : bracket a region with save / restore, keeping the save in `us`.
/psb{/us save store}bd
/pse{us restore}bd
% level2 / level3 : true when `languagelevel` exists and is >= 2 / >= 3.
/level2 /languagelevel where { pop languagelevel 2 ge }{ F }ifelse def
/level3 /languagelevel where { pop languagelevel 3 ge }{ F }ifelse def
/odictstk Z
/oopstk Z
% fcl : pop operands above depth `oopstk` and `end` dictionaries above depth
% `odictstk`, i.e. unwind both stacks to the recorded depths.
/fcl { count oopstk sub dup 0 gt { {pop}repeat }{ pop }ifelse countdictstack odictstk sub dup 0 gt { {end}repeat }{ pop }ifelse }bd
% sfcl2 / efcl2 : record stack depths; later run the procedure on the stack
% under `stopped`, clear the error flag on failure, then unwind with fcl.
/sfcl2 { /odictstk countdictstack store count/oopstk xs }bd
/efcl2 { stopped{$error/newerror F put}if fcl }bd
/noload Z
% `bool startnoload ... bool endnoload` : when bool is true the enclosed
% definitions are made after a `save` and discarded again by `restore`.
/startnoload { {/noload save store}if }bd
/endnoload { {noload restore}if }bd
% ---- definitions kept only when the interpreter is NOT Level 2 -------------
level2 startnoload
/setjob { statusdict/jobname 3 -1 roll put }bd
/setcopies { userdict/#copies 3 -1 roll put }bd
/devg/DeviceGray def
/devr/DeviceRGB def
/devc/DeviceCMYK def
/ststpgdev{}def
/dopgdev{}def
/stpgdev{}def
/buf Z
/didstop T def
% sfcl : execute the rest of the current file under `stopped`.  efcl clears
% didstop before running the feature procedure, so if a stop arrives while
% didstop is still true the input is skipped line by line until a line that
% is exactly `}efcl` has been read (hence `}efcl` must be alone on its line).
/sfcl {
  /didstop T store
  /odictstk countdictstack store
  count/oopstk xs
  currentfile cvx stopped
  {
    $error/newerror F put
    didstop
    {
      save/didstop xs
      % line buffer: free VM clamped to the range 0..64000 bytes
      /buf vmstatus exch sub exch pop dup 0 lt{pop 0}if dup 64000 gt{pop 64000}if string store
      {
        currentfile buf readline
        { (}efcl)eq{exit}if }
        { /UnexpectedEOF errordict/rangecheck get exec }ifelse
      }loop
      didstop restore
    }if
  }if
  fcl
}bd
/efcl { /didstop F store exec stop }bd
level2 endnoload
% ---- definitions kept only when the interpreter IS Level 2 or later --------
level2 not startnoload
/setjob { 1 dict begin/JobName xdf currentdict end setuserparams }bd
/setcopies { 1 dict begin/NumCopies xdf currentdict end setpagedevice }bd
/devg[/DeviceGray]def
/devr[/DeviceRGB]def
/devc[/DeviceCMYK]def
/setpagedevice where{pop/realstpgdev/setpagedevice ld}if
/SC_topddict Z
/SC_spdict Z
% dopgdev : remove md's setpagedevice override and apply the accumulated
% request dictionary with the real operator.
/dopgdev { md/setpagedevice undef SC_topddict realstpgdev }bd
% stpgdev : merge a page-device request into SC_topddict; keys that are
% known in SC_spdict have their sub-dictionary entries merged individually.
/stpgdev { SC_topddict dup 3 -1 roll { SC_spdict 2 index known { SC_spdict 2 index get dup 3 -1 roll { put dup }forall pop put dup }{ put dup }ifelse }forall pop pop }bd
% ststpgdev : install stpgdev as md's setpagedevice and reset both
% accumulator dictionaries.
/ststpgdev { md/setpagedevice/stpgdev load put /SC_topddict 0 dict store /SC_spdict 3 dict begin /InputAttributes 0 dict def /Policies 0 dict def /OutputAttributes 0 dict def currentdict end store }def
/sfcl/sfcl2 ld
/efcl/efcl2 ld
level2 not endnoload
/pm Z
/mT Z
/sD Z
/realshowpage Z
% initializepage : mTS := mT x current matrix, save VM in pm, concat mT.
/initializepage { mT $m currentmatrix mTS concatmatrix pop /pm save store mT concat }bd
% endp : restore the save made by initializepage.
/endp { pm restore }bd
% adjRect : x y w h d  ->  x+d y+d w-2d h-2d   (inset a rectangle by d)
/adjRect { dup 2 mul 6 2 roll 4 index sub exch 5 -1 roll sub exch 4 2 roll 4 index add exch 5 -1 roll add exch 4 2 roll }bd
% frame1up : stroke `numframes` nested rectangles inside the clip path's
% bounding box, using line width QDframwid and insets scaled by framewidth.
/frame1up { gS mTS setmatrix QDframwid lw /setstrokeadjust where{pop T setstrokeadjust}if clippath pathbbox 2 index sub exch 3 index sub exch currentlinewidth framewidth mul adjRect numframes dup 0 lt{pop 0}if { 4 copy rS currentlinewidth framewidth mul 4 mul adjRect }repeat pop pop pop pop gR }bd
/$c devr def
% rC / rF / rS : x y w h rectangle clip / fill / stroke.  The native
% rectclip / rectfill / rectstroke operators are used when present,
% otherwise the path is built by hand.
/rectclip where { pop/rC/rectclip ld }{ /rC { np 4 2 roll :M 1 index 0 rl 0 exch rl neg 0 rl :K clip np }bd }ifelse
/rectfill where { pop/rF/rectfill ld }{ /rF { gS np 4 2 roll :M 1 index 0 rl 0 exch rl neg 0 rl fill gR }bd }ifelse
/rectstroke where { pop/rS/rectstroke ld }{ /rS { gS np 4 2 roll :M 1 index 0 rl 0 exch rl neg 0 rl :K stroke gR }bd }ifelse
%%EndFile
% ---- safe clip, kept only when the interpreter is NOT Level 3 --------------
level3 startnoload
%%BeginFile: lw8_safeclipL12-1.0
% rCa : clip to an array holding x y w h groups (4 numbers per rectangle).
/rectclip where { pop/rCa/rectclip ld }{ /rCa { np 0 1 index length 4 idiv { 2 copy 4 getinterval aload pop 4 2 roll :M 1 index 0 rl 0 exch rl neg 0 rl :K 4 add }repeat clip np pop pop }bd }ifelse
/savedstack Z
/subsavedstack Z
/execstring Z
% saferCa : stash the whole operand stack in an array, execute the given
% object under `stopped` (clearing the error flag and the path on failure),
% unwind with fcl, then push the stashed operands back.
/saferCa {
  /execstring xs
  /odictstk countdictstack store
  /oopstk 0 store
  count 0 ne
  {
    savedstack 0 eq{ count 100 gt{count}{100}ifelse array/savedstack xs }{ count savedstack length gt { count array/savedstack xs }if }ifelse
    count savedstack 0 3 -1 roll getinterval astore/subsavedstack xs
  }{
    /subsavedstack 0 store
  }ifelse
  execstring cvx stopped { $error/newerror F put newpath }if
  fcl
  subsavedstack 0 ne{ subsavedstack aload pop }if
}bd
%%EndFile
level3 endnoload
% ---- safe clip, kept only when the interpreter IS Level 3 ------------------
level3 not startnoload
%%BeginFile: lw8_safeclipL3-1.0
/saferCa/pop ld
%%EndFile
level3 not endnoload
%%BeginFile: lw8_level1_colorspace-2.0
% Colour setters: G and :F1 gray, :F RGB, :F4 CMYK.  Without setcmykcolor,
% :F4 converts to RGB as 1 - min(1, component + black).
/G/setgray ld
/:F1/setgray ld
/:F/setrgbcolor ld
/:F4/setcmykcolor where { pop /setcmykcolor ld }{ { 3 { dup 3 -1 roll add dup 1 gt{pop 1}if 1 exch sub 4 1 roll }repeat pop setrgbcolor }bd }ifelse
% :Fx : mark c1..cn :Fx -- picks G, :F or :F4 for n = 1, 3 or 4 components.
/:Fx { counttomark {0{G}0{:F}{:F4}} exch get exec pop }bd
/$cs Z
/:rg{devr :ss}bd
/:sc{$cs :ss}bd
% :dc : map a colour space (name, or array whose first element is the name)
% to devc / devg / devr and remember it in $cs.
/:dc { dup type/arraytype eq{0 get}if dup/DeviceCMYK eq { pop devc }{ /DeviceGray eq { devg }{ devr }ifelse }ifelse /$cs xdf }bd
/:sgl{}def
/:dr{}bd
/:fCRD{pop}bd
/:ckcs{}bd
/:ss{/$c xdf}bd
%%EndFile
%%BeginFile: lw8_basic_text-2.0
% Text shorthands.  S: show.  A: ax string ashow (ay = 0).
% R: cx string widthshow on spaces.  W: cx char string widthshow.
% J: cx ax string -> awidthshow with cx added to each space (char 32) and
%    ax added to every glyph.  V: cx char ax string awidthshow.
/S/show ld
/A{ 0.0 exch ashow }bd
/R{ 0.0 exch 32 exch widthshow }bd
/W{ 0.0 3 1 roll widthshow }bd
/J{ 0.0 32 4 2 roll 0.0 exch awidthshow }bd
/V{ 0.0 4 1 roll 0.0 exch awidthshow }bd
/fcflg T def
% fc : print a one-time warning when fewer than 50000 bytes of VM remain.
/fc{ fcflg{ vmstatus exch sub 50000 lt{ (%%[ Warning: Running out of memory ]%%\r)print flush/fcflg F store }if pop }if }bd
% $f flips the y axis; :ff applies it to a font with makefont.
/$f[1 0 0 -1 0 0]def
/:ff{$f :mf}bd
% MacEncoding starts as a copy of StandardEncoding with codes 9, 39 and 96
% remapped.  The glyph names that follow are left on the operand stack; the
% file continues the list after this point and stores all 128 names into
% codes 128..255 with `MacEncoding 128 128 getinterval astore pop`.
/MacEncoding StandardEncoding 256 array copy def
MacEncoding dup 9/space put dup 39/quotesingle put 96/grave put
/Adieresis/Aring/Ccedilla/Eacute/Ntilde/Odieresis/Udieresis/aacute
/agrave/acircumflex/adieresis/atilde/aring/ccedilla/eacute/egrave
/ecircumflex/edieresis/iacute/igrave/icircumflex/idieresis/ntilde/oacute
/ograve/ocircumflex/odieresis/otilde/uacute/ugrave/ucircumflex/udieresis
/dagger/degree/cent/sterling/section/bullet/paragraph/germandbls
/registered/copyright/trademark/acute/dieresis/notequal/AE/Oslash
/infinity/plusminus/lessequal/greaterequal/yen/mu/partialdiff/summation
/product/pi/integral/ordfeminine/ordmasculine/Omega/ae/oslash
/questiondown/exclamdown/logicalnot/radical/florin/approxequal/Delta/guillemotleft
/guillemotright/ellipsis/space/Agrave/Atilde/Otilde/OE/oe
/endash/emdash/quotedblleft/quotedblright/quoteleft/quoteright/divide/lozenge
/ydieresis/Ydieresis/fraction/currency/guilsinglleft/guilsinglright/fi/fl
/daggerdbl/periodcentered/quotesinglbase/quotedblbase/perthousand
/Acircumflex/Ecircumflex/Aacute/Edieresis/Egrave/Iacute/Icircumflex/Idieresis/Igrave
/Oacute/Ocircumflex/apple/Ograve/Uacute/Ucircumflex/Ugrave/dotlessi/circumflex/tilde
/macron/breve/dotaccent/ring/cedilla/hungarumlaut/ogonek/caron
% Store the 128 glyph names now on the stack into codes 128..255 of
% MacEncoding (so e.g. \321 = emdash, \322/\323 = quotedblleft/right,
% \325 = quoteright in the page strings below).
MacEncoding 128 128 getinterval astore pop
% NOTE: every %%... DSC comment in this file must stay on its own line; a
% `%` comment runs to end-of-line and would hide any code placed after it.
% ---- font helpers kept only when the interpreter is NOT Level 2 ------------
level2 startnoload
% copyfontdict : begin a new dict holding every font entry except FID.
/copyfontdict { findfont dup length dict begin { 1 index/FID ne{def}{pop pop}ifelse }forall }bd
/$ckeyd md def
% :skey : put key/value into the current key dictionary; when the requested
% head-room is not available, chain a fresh 20-entry dict under /$mkeys and
% make that the current key dictionary first.
/:skey { 1 index maxlength 2 index length sub ge { begin /$mkeys 20 dict def /$mkeys load end dup/$ckeyd xs }if 3 1 roll put }bd
/fD1pass { 40 $ckeyd //md ne { pop 1 }if $ckeyd exch :skey }bd
/:searchdict Z
% :searchdict / lU : look a key up in md, following the /$mkeys chain.
/:searchdict { exch 2 copy known { get }{ exch/$mkeys get :searchdict }ifelse }bd
/lU { //md exch 2 copy known { get }{ exch /$mkeys get :searchdict }ifelse }bd
level2 endnoload
% ---- font helpers kept only when the interpreter IS Level 2 or later -------
level2 not startnoload
/copyfontdict { findfont dup length dict copy begin }bd
/fD1pass/def ld
/lU/load ld
level2 not endnoload
/fD Z
/sf Z
/scf Z
/sf1pass { lU setfont }bd
/scf1pass { exch lU exch scalefont fD }bd
/scf2pass { scalefont fD }bd
md/fontname known not{ /fontname/customfont def }if
/Encoding Z
% :mre : key fontname :mre -- copy the font, give it MacEncoding, define it
% under `fontname`, flip it with :ff and bind the result to key via fD.
/:mre { copyfontdict /Encoding MacEncoding def fontname currentdict end definefont :ff fD }bd
% :bsr ... pd ... :esr : same, but with a private copy of the font's own
% encoding that the caller patches entry by entry between :bsr and :esr.
/:bsr { copyfontdict /Encoding Encoding 256 array copy def Encoding dup }bd
/pd{put dup}bd
/:esr { pop pop fontname currentdict end definefont :ff fD }bd
/ps Z
/fz{/ps xs}bd
/cF/currentfont ld
/mbf { /makeblendedfont where { pop makeblendedfont /ABlend exch definefont }{ pop }ifelse fD }def
%%EndFile
% Put back the packing mode saved at the top of the prolog, then close md.
/currentpacking where {pop sc_oldpacking setpacking}if
end
%%EndProlog
%%BeginSetup
md begin
/fD/def ld/sf/setfont ld
/scf/scf2pass ld
% Each device feature is wrapped in sfcl{ ... }efcl so that a failing
% setpagedevice request is caught instead of aborting the job.  `}efcl` is
% kept alone on its line: the Level 1 sfcl recovery loop skips input until
% it reads a line that is exactly `}efcl`.
sfcl{
%%BeginFeature: *ManualFeed False
1 dict dup /Policies 2 dict dup /PageSize 2 put dup /MediaType 0 put put setpagedevice
1 dict dup /ManualFeed false put setpagedevice
%%EndFeature
}efcl
sfcl{
%%BeginFeature: *Smoothing None
1 dict dup /Policies 2 dict dup /PageSize 2 put dup /MediaType 0 put put setpagedevice
2 dict dup /PostRenderingEnhance true put dup /PostRenderingEnhanceDetails 2 dict dup /REValue 0 put dup /Type 8 put put setpagedevice
%%EndFeature
}efcl
(Bronwyn)setjob
sfcl{/currentdistillerparams where{pop 1 dict dup/Orientation 0 put setpagedevice}if
}efcl
% Page matrix: flip the y axis and move the origin to (17, 819), so page
% coordinates below grow downwards.
/mT[1 0 0 -1 17 819]def
/sD 16 dict def
% Wait timeout of 600: user parameter on Level 2, statusdict entry otherwise.
600 level2{1 dict dup/WaitTimeout 4 -1 roll put setuserparams}{statusdict/waittimeout 3 -1 roll put}ifelse
% Fonts: each base font is re-encoded with :mre, then scaled variants are
% defined with scf (key basefont size scf).
%%IncludeFont: TimesNewRomanPSMT
/f47/TimesNewRomanPSMT :mre
/f67 f47 10 scf
%%IncludeFont: TimesNewRomanPS-BoldMT
/f82/TimesNewRomanPS-BoldMT :mre
/f104 f82 16 scf
/f119 f82 10 scf
/f134 f82 12 scf
%%IncludeFont: Arial-BoldMT
/f157/Arial-BoldMT :mre
/f174 f157 10 scf
%%IncludeFont: TimesNewRomanPS-ItalicMT
/f189/TimesNewRomanPS-ItalicMT :mre
/f212 f189 10 scf
/Courier findfont[10 0 0 -10 0 0]:mf setfont
%%EndSetup
%%Page: 1 1
%%BeginPageSetup
initializepage
(Bronwyn; page: 1 of 1)setjob
%%EndPageSetup
% Page body.  One line per text run: `x y :M` moves, `fNN sf` selects a font,
% `(text)S` shows, and `cx ax (text)J` shows with cx added to every space
% and ax added to every glyph (justified lines).  Strings are kept on a
% single line each; a line break inside a string would become part of the
% shown text.
gS 0 0 560 797 rC
0 0 560 792 rC
% page number
299 732 :M f67 sf (1)S
% title and the two rules around it
128 80 :M f104 sf (U)S
140 80 :M (sin)S
159 80 :M (g lan)S
192 80 :M (gu)S
209 80 :M (age mod)S
266 80 :M (els for gen)S
335 80 :M (eric en)S
380 80 :M (tity extraction)S
57 49 488 1 rF
57 99 488 1 rF
% authors and affiliation
181 141 :M f119 sf (Ian H. Witten, Zane Bray, Malika Mahoui, W.J. Teahan)S
264 152 :M f67 sf (Computer Science)S
257 163 :M (University of Waikato)S
253 174 :M (Hamilton, New Zealand)S
256 185 :M (ihw@cs.waikato.ac.nz)S
% abstract
153 229 :M f134 sf (Abstract)S
76 260 :M f67 sf 3.464 .346(This paper describes the use of statistical)J
76 271 :M 3.222 .322(language modeling techniques, such as are)J
76 282 :M .789 .079(commonly used for text compression, to extract)J
76 293 :M 2.343 .234(meaningful, low-level, information about the)J
76 304 :M .211 .021(location of semantic tokens, or \322entities,\323 in text.)J
76 315 :M .641 .064(We begin by marking up several different token)J
76 326 :M 3.525 .352(types in training documents\321for example,)J
76 337 :M 1.208 .121(people\325s names, dates and time periods, phone)J
76 348 :M 3.183 .318(numbers, and sums of money. We form a)J
76 359 :M .244 .024(language model for each token type and examine)J
76 370 :M .379 .038(how accurately it identifies new tokens. We then)J
76 381 :M 4.12 .412(apply a search algorithm to insert token)J
76 392 :M .201 .02(boundaries in a way that maximizes compression)J
76 403 :M .185 .018(of the entire test document. The technique can be)J
76 414 :M .572 .057(applied to hierarchically-defined tokens, leading)J
76 425 :M .628 .063(to a kind of \322soft parsing\323 that will, we believe,)J
76 436 :M 2.503 .25(be able to identify structured items such as)J
76 447 :M .391 .039(references and tables in html or plain text, based)J
76 458 :M .248 .025(on nothing more than a few marked-up examples)J
76 469 :M (in training documents.)S
% section 1, left column
58 498 :M f119 sf (1.)S f174 sf ( )S
72 498 :M f119 sf (INTRODUCTION)S
58 515 :M f67 sf .245 .025(Text mining is about looking for patterns in text, and may)J
58 526 :M 1.758 .176(be defined as the process of analyzing text to extract)J
58 537 :M 3.531 .353(information that is useful for particular purposes.)J
58 548 :M .72 .072(Compared with the kind of data stored in databases, text)J
58 559 :M 1.65 .165(is unstructured, amorphous, and difficult to deal with.)J
58 570 :M .432 .043(Nevertheless, in modern Western culture, text is the most)J
58 581 :M .585 .058(common vehicle for the formal exchange of information.)J
58 592 :M .289 .029(The motivation for trying to extract information from it is)J
58 603 :M (compelling\321even if success is only partial.)S
58 620 :M 2.172 .217(Text mining is possible because you do not have to)J
58 631 :M .096 .01(understand text in order to extract useful information from)J
58 642 :M .708 .071(it. Here are four examples. First, if only names could be)J
58 653 :M .892 .089(identified, links could be inserted automatically to other)J
58 664 :M 2.891 .289(places that mention the same name\321links that are)J
58 675 :M .601 .06(\322dynamically evaluated\323 by calling upon a search engine)J
% right column
310 229 :M 2.594 .259(to bind them at click time. Second, actions can be)J
310 240 :M 2.567 .257(associated with different types of data, using either)J
310 251 :M .748 .075(explicit programming or programming-by-demonstration)J
310 262 :M .563 .056(techniques. A day/time specification appearing anywhere)J
310 273 :M .465 .047(within one\325s email could be associated with diary actions)J
310 284 :M 2.076 .208(such as updating a personal organizer or creating an)J
310 295 :M .089 .009(automatic reminder, and each mention of a day/time in the)J
310 306 :M .619 .062(text could raise a popup menu of calendar-based actions.)J
310 317 :M 1.635 .163(Third, text could be mined for data in tabular format,)J
310 328 :M 1.303 .13(allowing databases to be created from formatted tables)J
310 339 :M .682 .068(such as stock-market information on Web pages. Fourth,)J
310 350 :M 1.557 .156(an agent could monitor incoming newswire stories for)J
310 361 :M 2.419 .242(company names and collect documents that mention)J
310 372 :M (them\321an automated press clipping service.)S
310 389 :M 1.429 .143(In all these examples, the key problem is to recognize)J
310 400 :M 1.47 .147(different types of target fragments, which we will call)J
310 411 :M 1.635 .164(tokens or \322entities\323. This is really a kind of language)J
310 422 :M .415 .042(recognition problem: we have a text made up of different)J
310 433 :M .091 .009(sublanguages \(for personal names, company names, dates,)J
310 444 :M 1.595 .16(table entries, and so on\) and seek to determine which)J
310 455 :M (parts are expressed in which language.)S
310 472 :M .192 .019(The information extraction research community \(of which)J
310 483 :M .594 .059(we were, until recently, unaware\) has studied these tasks)J
310 494 :M 1.631 .163(and reported results at annual Message Understanding)J
310 505 :M 1.018 .102(Conferences \(MUC\). For example, \322named entities\323 are)J
310 516 :M 2.795 .28(defined as proper names and quantities of interest,)J
310 527 :M .883 .088(including personal, organization, and location names, as)J
310 538 :M .689 .069(well as dates, times, percentages, and monetary amounts)J
310 549 :M (\(Chinchor, 1999\).)S
310 566 :M 3.12 .312(The standard approach to this problem is manual:)J
310 577 :M 2.816 .282(tokenizers and grammars are hand-designed for the)J
310 588 :M 3.113 .311(particular data being extracted. Looking at current)J
310 599 :M 2.306 .231(commercial state-of-the-art text mining software, for)J
310 610 :M .54 .054(example, IBM\325s )J f212 sf .519 .052(Intelligent Miner for Text)J
484 610 :M f67 sf .575 .058( \(Tkach, 1997\))J
310 621 :M .813 .081(uses specific recognition modules carefully programmed)J
310 632 :M .581 .058(for the different data types, while Apple\325s )J
487 632 :M f212 sf .407 .041(data detectors)J
310 643 :M f67 sf 1.306 .131(\(Nardi )J
341 643 :M f212 sf .991 .099(et al)J f67 sf 1.369 .137(., 1998\) uses language grammars. The )J
528 643 :M f212 sf (Text)S
310 654 :M 1.337 .134(Tokenization Tool)J
387 654 :M f67 sf 2.429 .243( of Grover )J f212 sf 2.819 .282(et al)J
462 654 :M f67 sf 2.173 .217(. \(1999\) is another)J
310 665 :M .62 .062(example, and a demonstration version is available on the)J
310 676 :M 2.641 .264(Web. The challenge for machine learning is to use)J
% endp restores the save taken by initializepage; showpage emits the page.
endp
showpage
%%Trailer
% closes the `md begin` from %%BeginSetup
end
%%EOF