|
- import re
-
# Maps apostrophe-less (or misplaced-apostrophe) spellings to the canonical
# contraction. Looked up per whitespace-separated word by normalize_word().
#
# NOTE: normalize_word() lowercases its input before the lookup, so the keys
# that contain a capital "I" ("Id've", "I'dve", "Im", "Ive") are never hit
# from that code path; they are kept for any other callers.
contractions = {
    "aint": "ain't",
    "arent": "aren't",
    "cant": "can't",
    "couldve": "could've",
    "couldnt": "couldn't",
    "couldn'tve": "couldn't've",
    "couldnt've": "couldn't've",
    "didnt": "didn't",
    "doesnt": "doesn't",
    "dont": "don't",
    "hadnt": "hadn't",
    "hadnt've": "hadn't've",
    "hadn'tve": "hadn't've",
    "hasnt": "hasn't",
    "havent": "haven't",
    "hed": "he'd",
    "hed've": "he'd've",
    "he'dve": "he'd've",
    "hes": "he's",
    "howd": "how'd",
    "howll": "how'll",
    "hows": "how's",
    "Id've": "I'd've",
    "I'dve": "I'd've",
    "Im": "I'm",
    "Ive": "I've",
    "isnt": "isn't",
    "itd": "it'd",
    "itd've": "it'd've",
    "it'dve": "it'd've",
    "itll": "it'll",
    "let's": "let's",
    "maam": "ma'am",
    "mightnt": "mightn't",
    "mightnt've": "mightn't've",
    "mightn'tve": "mightn't've",
    "mightve": "might've",
    "mustnt": "mustn't",
    "mustve": "must've",
    "neednt": "needn't",
    "notve": "not've",
    "oclock": "o'clock",
    "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at",
    "'ows'at": "'ow's'at",
    "'ow'sat": "'ow's'at",
    "shant": "shan't",
    "shed've": "she'd've",
    "she'dve": "she'd've",
    "she's": "she's",
    "shouldve": "should've",
    "shouldnt": "shouldn't",
    "shouldnt've": "shouldn't've",
    "shouldn'tve": "shouldn't've",
    # Was reversed ("somebody'd" -> "somebodyd"), which stripped the
    # apostrophe from an already-correct contraction.
    "somebodyd": "somebody'd",
    "somebodyd've": "somebody'd've",
    "somebody'dve": "somebody'd've",
    "somebodyll": "somebody'll",
    "somebodys": "somebody's",
    "someoned": "someone'd",
    "someoned've": "someone'd've",
    "someone'dve": "someone'd've",
    "someonell": "someone'll",
    "someones": "someone's",
    "somethingd": "something'd",
    "somethingd've": "something'd've",
    "something'dve": "something'd've",
    "somethingll": "something'll",
    "thats": "that's",
    "thered": "there'd",
    "thered've": "there'd've",
    "there'dve": "there'd've",
    "therere": "there're",
    "theres": "there's",
    "theyd": "they'd",
    "theyd've": "they'd've",
    "they'dve": "they'd've",
    "theyll": "they'll",
    "theyre": "they're",
    "theyve": "they've",
    "twas": "'twas",
    "wasnt": "wasn't",
    "wed've": "we'd've",
    "we'dve": "we'd've",
    "weve": "we've",
    "werent": "weren't",
    "whatll": "what'll",
    "whatre": "what're",
    "whats": "what's",
    "whatve": "what've",
    "whens": "when's",
    "whered": "where'd",
    "wheres": "where's",
    "whereve": "where've",
    "whod": "who'd",
    "whod've": "who'd've",
    "who'dve": "who'd've",
    "wholl": "who'll",
    "whos": "who's",
    "whove": "who've",
    "whyll": "why'll",
    "whyre": "why're",
    "whys": "why's",
    "wont": "won't",
    "wouldve": "would've",
    "wouldnt": "wouldn't",
    "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've",
    "yall": "y'all",
    "yall'll": "y'all'll",
    "y'allll": "y'all'll",
    "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've",
    "y'all'dve": "y'all'd've",
    "youd": "you'd",
    "youd've": "you'd've",
    "you'dve": "you'd've",
    "youll": "you'll",
    "youre": "you're",
    "youve": "you've",
}
-
# Spelled-out small numbers (and "none") mapped to their digit strings.
manual_map = {
    "none": "0",
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
}

# Words dropped entirely during normalization.
articles = ["a", "an", "the"]

# Raw strings avoid the invalid-escape-sequence warning that "\d" / "\." /
# "\," trigger in ordinary string literals; the compiled patterns are
# unchanged.
#
# NOTE(review): "(?!<=\d)" is a negative *lookahead* for the literal text
# "<=" followed by a digit, so at a "." it always succeeds. The effective
# behavior is "match a period that is not followed by a digit". It looks like
# a typo for the lookbehind "(?<!\d)", but it is kept as-is because changing
# it would alter normalization results (e.g. a trailing "3." would keep its
# period).
period_strip = re.compile(r"(?!<=\d)(\.)(?!\d)")

# Matches a comma sitting between two digits, e.g. the one in "1,000".
comma_strip = re.compile(r"(\d)(\,)(\d)")

# Punctuation characters that are deleted or replaced by a space; the period
# is handled separately by period_strip.
punct = [
    ";",
    "/",
    "[",
    "]",
    '"',
    "{",
    "}",
    "(",
    ")",
    "=",
    "+",
    "\\",
    "_",
    "-",
    ">",
    "<",
    "@",
    "`",
    ",",
    "?",
    "!",
]
-
-
def normalize_word(token):
    """Normalize an answer string so that equivalent answers compare equal.

    The steps, in order:

    1. Each character in ``punct`` is deleted when it appears next to a
       space in the input or when the input contains a digit-comma-digit
       group (``comma_strip``); otherwise it is replaced by a space.
    2. Periods matched by ``period_strip`` are removed.
    3. The text is lowercased and split on whitespace; words found in
       ``manual_map`` are replaced by their digit string and words in
       ``articles`` are dropped.
    4. Remaining words found in ``contractions`` are replaced by the
       canonical contraction.

    Args:
        token: The raw string to normalize.

    Returns:
        The normalized words joined by single spaces.
    """
    # The result does not depend on the punctuation character, so search once.
    has_numeric_comma = comma_strip.search(token) is not None

    cleaned = token
    for p in punct:
        # The adjacency test inspects the original input rather than the
        # partially cleaned string, matching the original behavior.
        if has_numeric_comma or p + " " in token or " " + p in token:
            cleaned = cleaned.replace(p, "")
        else:
            cleaned = cleaned.replace(p, " ")

    # No third argument: Pattern.sub's third positional parameter is `count`,
    # so passing re.UNICODE there capped the substitution at 32 periods.
    cleaned = period_strip.sub("", cleaned)

    words = []
    for word in cleaned.lower().split():
        # .get rather than .setdefault, so the shared manual_map is not grown
        # with every word that passes through.
        word = manual_map.get(word, word)
        if word not in articles:
            words.append(contractions.get(word, word))

    # "," is in punct and therefore already handled above; this replace is
    # kept only as a defensive no-op.
    return " ".join(words).replace(",", "")
|