Source code for hal.strings.models
# -*- coding: utf-8 -*-
"""String models"""
import unicodedata
from pyparsing import Literal, Word, nums, Combine, Optional, delimitedList, \
alphas, oneOf, Suppress
[docs]class String:
"""Models string"""
def __init__(self, string):
self.string = str(string)
def remove_escapes(self):
    """Drops backslash escape sequences from the string

    Every backslash is discarded together with the single character that
    follows it (if there is one); all other characters are kept in order.

    :return: the string without backslashes and the characters they escape
    """
    kept = []
    skip_next = False  # raised right after a backslash so its partner is dropped
    for char in self.string:
        if skip_next:
            skip_next = False
            continue
        if char == "\\":
            skip_next = True
            continue
        kept.append(char)
    return "".join(kept)
def remove_non_ascii(self):
    """Drops every character outside the 7-bit ASCII range

    :return: the string restricted to code points below 128
    """
    # Encoding with errors="ignore" silently discards whatever ASCII cannot
    # represent, i.e. exactly the code points >= 128.
    ascii_bytes = self.string.encode("ascii", "ignore")
    return ascii_bytes.decode("ascii")
def convert_accents(self):
    """Strips accents by decomposing characters and dropping the marks

    The text is normalized to NFKD and every character that
    ``unicodedata.combining`` reports as a combining mark is discarded.

    :return: the NFKD-normalized string without combining characters
    """
    decomposed = unicodedata.normalize('NFKD', self.string)
    return "".join(
        symbol for symbol in decomposed
        if unicodedata.combining(symbol) == 0
    )
def remove_control_chars(self):
    """Strips escape sequences introduced by the ESC character from text

    A sequence is matched as: ESC (0x1b), ``[``, an optional
    ``;``-separated list of integers, and one closing character taken from
    pyparsing's ``alphas``. Matched sequences are suppressed from the output.

    :return: input with the matched escape sequences removed
    """
    parameters = Optional(delimitedList(Word(nums), ';'))
    final_letter = oneOf(list(alphas))
    # Combine makes the pieces match only when they are directly adjacent.
    sequence = Combine(Literal('\x1b') + '[' + parameters + final_letter)
    return Suppress(sequence).transformString(self.string)
def strip_bad_html(self):
    """Repeatedly scrubs formatting noise until the string is well formatted

    Each pass removes colons, turns an escaped quote (backslash followed by
    ``'``) into a plain ``'``, deletes literal and backslash-escaped
    newline, carriage-return and tab sequences, and trims surrounding
    whitespace. Passes repeat while ``String(out).is_well_formatted()`` is
    false. Despite the method name, no HTML tags are parsed or removed here.

    The loop also stops as soon as a pass leaves the text unchanged: a pass
    depends only on its input, so an unchanged result would be cleaned the
    same way forever and the loop would never terminate.

    :return: the cleaned string
    """
    # Order matters: the pairs are applied one after another on each pass.
    replacements = (
        (":", ""),
        ("\\'", "\'"),
        ("\\n", ""),
        ("\\r", ""),
        ("\\t", ""),
        ("\n", ""),
        ("\r", ""),
        ("\t", ""),
        # NOTE(review): as written this pair replaces a space with a space
        # and is a no-op; it may have been a double-space collapse before
        # whitespace was normalized - confirm against the upstream file.
        (" ", " "),
    )

    out = self.string
    while not String(out).is_well_formatted():
        cleaned = out
        for old, new in replacements:
            cleaned = cleaned.replace(old, new)
        cleaned = cleaned.strip()

        if cleaned == out:
            break  # no progress: further passes cannot change the outcome
        out = cleaned

    return str(out)
def remove_all(self, token):
    """Replaces every space with token and collapses doubled tokens

    NOTE(review): despite the method name, existing occurrences of
    ``token`` are not removed; the visible behavior is a space-to-token
    substitution followed by de-duplication - confirm this is what callers
    expect.

    :param token: string substituted for each space
    :return: input with spaces replaced by ``token`` and no two tokens
        left directly next to each other
    """
    out = self.string.replace(" ", token)  # every space becomes a token

    if not token:
        # An empty token is "found" at every position and replacing it
        # changes nothing, so the collapse loop below would never end.
        return out

    doubled = token + token
    while doubled in out:  # while two tokens still sit side by side
        out = out.replace(doubled, token)
    return out