Add original title to story text

This commit is contained in:
Timothy Allen 2023-12-30 12:29:56 +02:00
parent 61d32c5286
commit bedf82d8a1
1 changed file with 6 additions and 2 deletions

View File

@@ -105,6 +105,9 @@ def parse_and_extract(input_dir, verbose):
text = list()
lang = ""
try:
for t in doc.find('./publisher_headline'):
if t.text is not None:
text.append(t.text)
for p in doc.find('./body'):
if p.text is not None:
text.append(p.text)
@@ -152,8 +155,8 @@ def scrub_data(articles, verbose):
data['content'] = data.content.parallel_apply(lambda x: x.translate(remove_digits))
# Remove extra spaces
data['content']=data.content.parallel_apply(lambda x: x.strip())
data['content']=data.content.parallel_apply(lambda x: re.sub(" +", " ", x))
data['content'] = data.content.parallel_apply(lambda x: x.strip())
data['content'] = data.content.parallel_apply(lambda x: re.sub(" +", " ", x))
# TODO: lemmas? See spaCy
@@ -184,6 +187,7 @@ def main():
articles = parse_and_extract(args.input, args.verbose)
data = scrub_data(articles, args.verbose)
#print(data)
write_csv(data, args.output)