Add original title to story text
This commit is contained in:
parent
61d32c5286
commit
bedf82d8a1
@ -105,6 +105,9 @@ def parse_and_extract(input_dir, verbose):
|
||||
text = list()
|
||||
lang = ""
|
||||
try:
|
||||
for t in doc.find('./publisher_headline'):
|
||||
if t.text is not None:
|
||||
text.append(t.text)
|
||||
for p in doc.find('./body'):
|
||||
if p.text is not None:
|
||||
text.append(p.text)
|
||||
@ -152,8 +155,8 @@ def scrub_data(articles, verbose):
|
||||
data['content'] = data.content.parallel_apply(lambda x: x.translate(remove_digits))
|
||||
|
||||
# Remove extra spaces
|
||||
data['content']=data.content.parallel_apply(lambda x: x.strip())
|
||||
data['content']=data.content.parallel_apply(lambda x: re.sub(" +", " ", x))
|
||||
data['content'] = data.content.parallel_apply(lambda x: x.strip())
|
||||
data['content'] = data.content.parallel_apply(lambda x: re.sub(" +", " ", x))
|
||||
|
||||
# TODO: lemmas? See spaCy
|
||||
|
||||
@ -184,6 +187,7 @@ def main():
|
||||
articles = parse_and_extract(args.input, args.verbose)
|
||||
|
||||
data = scrub_data(articles, args.verbose)
|
||||
#print(data)
|
||||
|
||||
write_csv(data, args.output)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user