Here's a quick Python script to do the job. It reads every file in the ./docs directory (these are expected to be HTML files) and writes each one out as clean text to the ./output directory.
Note the lxml dependency!
from lxml.html import clean
import glob, re
import os

# Compiled once up front. DOTALL lets a tag whose attributes wrap onto several
# lines still be matched (and removed) as a single tag.
TAG_RE = re.compile(r'<.*?>', re.DOTALL)
WHITESPACE_RE = re.compile(r'\s+')


def strip_markup(html):
    """Return *html* with every tag removed and whitespace runs collapsed.

    Anything between a ``<`` and the next ``>`` is dropped, then each run of
    whitespace (newlines included) is replaced by one space. Leading and
    trailing whitespace is therefore reduced to a single space, not removed.
    """
    return WHITESPACE_RE.sub(' ', TAG_RE.sub('', html))


def convert_docs(src_dir='docs', dst_dir='output'):
    """Write a plain-text copy of every file in *src_dir* into *dst_dir*.

    Each file is passed through lxml's ``Cleaner`` (configured with
    ``style``, ``scripts``, ``comments`` and ``safe_attrs_only`` enabled),
    then stripped of tags and collapsed whitespace. The result is printed
    after an 80-character ``=`` separator and saved as ``<name>.txt``, where
    ``<name>`` is the source file name without its final extension.

    *dst_dir* is created when it does not exist yet.
    """
    cleaner = clean.Cleaner(style=True, scripts=True, comments=True,
                            safe_attrs_only=True)
    if not os.path.isdir(dst_dir):
        os.makedirs(dst_dir)

    # Sorted so that the console output has a reproducible order.
    for path in sorted(glob.glob(os.path.join(src_dir, '*'))):
        if not os.path.isfile(path):
            continue  # glob also returns sub-directories; they cannot be read
        print('=' * 80)
        with open(path, 'r') as src:
            html = src.read()
        text = strip_markup(cleaner.clean_html(html))
        print(text)
        # splitext drops only the last extension, so "my.report.html" becomes
        # "my.report.txt" instead of colliding with other "my.*" files.
        stem = os.path.splitext(os.path.basename(path))[0]
        with open(os.path.join(dst_dir, stem + '.txt'), 'w') as dst:
            dst.write(text)


if __name__ == '__main__':
    convert_docs()
No comments:
Post a Comment