NLTK: a simple introduction and data cleaning

Time:2021-5-9

NLTK (Natural Language Toolkit) is a long-established Python library for tokenizing and processing English text

#Import word segmentation module
from nltk.tokenize import word_tokenize
from nltk.text import Text

# Sample paragraph to analyse. It is named sample_text (not `input`) so the
# builtin input() function is not shadowed for the rest of the session.
sample_text = '''
There were a sensitivity and a beauty to her that have nothing to do with looks. She was one to be listened to, whose words were so easy to take to heart.
'''
tokens = word_tokenize(sample_text)
# Show the first 5 tokens
print(tokens[:5])
# Lower-case every token so that "There" and "there" count as the same word
tokens = [w.lower() for w in tokens]

# Wrap the token list in an nltk Text object, which offers simple queries
t = Text(tokens)

# Number of times a word occurs. Printed explicitly: a bare expression in the
# middle of a cell/script is evaluated and then silently discarded.
print(t.count('beauty'))

# 0-based position of the first occurrence of a word in the token list
print(t.index('beauty'))

# Plot the frequencies of the 8 most common tokens
# Requires Matplotlib (pip install matplotlib)
t.plot(8)
['There', 'were', 'a', 'sensitivity', 'and']


Nltk simple introduction and data cleaning

Stop words

from nltk.corpus import stopwords

# List the languages for which a stop-word corpus is available; English is used below

stopwords.fileids()
['arabic',
 'azerbaijani',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish',
 'turkish']



# Show the complete English stop-word list as a single space-separated string
' '.join(stopwords.raw('english').split('\n'))
"i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't "



# Filter stop words

# Load the stop-word list once into a set: calling stopwords.words() inside
# the comprehension would re-read the corpus for every single token.
english_stopwords = set(stopwords.words('english'))

# dict.fromkeys() de-duplicates while keeping first-seen order, so the
# filtered result is reproducible (iterating a set gives an arbitrary order).
unique_tokens = list(dict.fromkeys(tokens))

# `tokens` is still rebound to the set of distinct tokens, as before
tokens = set(unique_tokens)

filtered = [w for w in unique_tokens if w not in english_stopwords]

print(filtered)
['nothing', 'sensitivity', ',', 'one', 'beauty', 'words', 'heart', 'looks', 'take', 'whose', '.', 'listened', 'easy']

Part of speech tagging

# On first use, download the required NLTK component with nltk.download()
from nltk import pos_tag
pos_tag(filtered)
[('nothing', 'NN'),
 ('sensitivity', 'NN'),
 (',', ','),
 ('one', 'CD'),
 ('beauty', 'NN'),
 ('words', 'NNS'),
 ('heart', 'NN'),
 ('looks', 'VBZ'),
 ('take', 'VB'),
 ('whose', 'WP$'),
 ('.', '.'),
 ('listened', 'VBN'),
 ('easy', 'JJ')]


POS Tag Reference
CC Coordinating conjunction
CD Cardinal number
DT Determiner
EX Existential "there"
FW Foreign word
IN Preposition or subordinating conjunction
JJ Adjective
JJR Adjective, comparative
JJS Adjective, superlative
LS List item marker
MD Modal verb
NN Noun, singular
NNS Noun, plural
NNP Proper noun, singular
PDT Predeterminer
POS Possessive ending
PRP Personal pronoun
PRP$ Possessive pronoun
RB Adverb
RBR Adverb, comparative
RBS Adverb, superlative
RP Particle
UH Interjection
VB Verb, base form
VBD Verb, past tense
VBG Verb, gerund or present participle
VBN Verb, past participle
VBP Verb, present tense, non-third-person singular
VBZ Verb, present tense, third-person singular
WDT Wh-determiner

Chunking

from nltk.chunk import RegexpParser

# A sentence that is already tokenized and POS-tagged: (word, tag) pairs
sentence = [('the','DT'),('little','JJ'),('yellow','JJ'),('dog','NN'),('died','VBD')]

# Chunk rule: an optional determiner, any number of adjectives, then a noun
grammar = "MY_NP: {<DT>?<JJ>*<NN>}"

# Build the chunker from the rule (RegexpParser is imported directly above;
# the `nltk` package name itself is never imported in this file)
cp = RegexpParser(grammar)

# Chunk the tagged sentence; the result is a tree
result = cp.parse(sentence)
print(result)

# Show the tree in NLTK's own viewer window (Tkinter-based, not Matplotlib);
# the call returns once that window is closed
result.draw()
(S (MY_NP the/DT little/JJ yellow/JJ dog/NN) died/VBD)



An exception has occurred, use %tb to see the full traceback.


SystemExit: 0


Named entity recognition

# On first use, download the required NLTK component with nltk.download()
from nltk import ne_chunk

# Named raw_sentence rather than `input` so the builtin input() is not shadowed
raw_sentence = "Edison went to Tsinghua University today."

# Tokenize -> POS-tag -> chunk named entities, then print the resulting tree
tagged_words = pos_tag(word_tokenize(raw_sentence))
print(ne_chunk(tagged_words))
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
(S
  (PERSON Edison/NNP)
  went/VBD
  to/TO
  (ORGANIZATION Tsinghua/NNP University/NNP)
  today/NN
  ./.)




Data cleaning

import re
from nltk.corpus import stopwords
#Input data
s = '    RT @Amila #Test\nTom\'s newly listed Co  &amp; Mary\'s unlisted     Group to supply tech for nlTK.\nh $TSLA $AAPL https:// t.co/x34afsfQsh'


def clean_text(text):
    """Strip tweet-style noise from *text* and normalise its whitespace.

    Removes, in order: HTML entities / @mentions / #hashtags, $TICKER
    symbols, hyperlinks, and words of one or two characters. Every run of
    whitespace is then collapsed to a single space and the ends are trimmed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (empty if nothing survives).
    """
    # Remove HTML entities (e.g. &amp;), @mentions and #hashtags
    text = re.sub(r'&\w*;|@\w*|#\w*', '', text)

    # Remove ticker symbols such as $TSLA. The "$" must be escaped: unescaped
    # it is the end-of-string anchor, so the pattern would never match a ticker.
    text = re.sub(r'\$\w*', '', text)

    # Remove hyperlinks: the scheme, at most one stray whitespace character
    # after it (as in "https:// t.co/..."), then everything up to the next
    # whitespace. This also catches URLs without a path and cannot run past
    # the URL the way a greedy ".*" does.
    text = re.sub(r'https?://\s?\S+', '', text)

    # Remove very short words (1-2 characters), e.g. "RT", "to", a stray "s"
    text = re.sub(r'\b\w{1,2}\b', '', text)

    # Collapse whitespace runs into ONE space (replacing them with '' would
    # glue neighbouring words together) and trim the ends
    return re.sub(r'\s+', ' ', text).strip()


s = clean_text(s)

# Tokenize the cleaned text
tokens = word_tokenize(s)

# Remove stop words. The list is loaded once into a set instead of being
# re-read for every token, and tokens are compared in lower case because the
# stop-word list itself is lower-case (so "The" is dropped as well as "the").
english_stopwords = set(stopwords.words('english'))
tokens = [w for w in tokens if w.lower() not in english_stopwords]

# The final result
print(' '.join(tokens))
Tom ' newly listedMary ' unlistedGroupsupply tech nlTK .

Nltk simple introduction and data cleaning