Add original title to story text
This commit is contained in:
parent 61d32c5286
commit bedf82d8a1
@@ -105,6 +105,9 @@ def parse_and_extract(input_dir, verbose):
|
|||||||
text = list()
|
text = list()
|
||||||
lang = ""
|
lang = ""
|
||||||
try:
|
try:
|
||||||
|
for t in doc.find('./publisher_headline'):
|
||||||
|
if t.text is not None:
|
||||||
|
text.append(t.text)
|
||||||
for p in doc.find('./body'):
|
for p in doc.find('./body'):
|
||||||
if p.text is not None:
|
if p.text is not None:
|
||||||
text.append(p.text)
|
text.append(p.text)
|
||||||
@@ -152,8 +155,8 @@ def scrub_data(articles, verbose):
|
|||||||
data['content'] = data.content.parallel_apply(lambda x: x.translate(remove_digits))
|
data['content'] = data.content.parallel_apply(lambda x: x.translate(remove_digits))
|
||||||
|
|
||||||
# Remove extra spaces
|
# Remove extra spaces
|
||||||
data['content']=data.content.parallel_apply(lambda x: x.strip())
|
data['content'] = data.content.parallel_apply(lambda x: x.strip())
|
||||||
data['content']=data.content.parallel_apply(lambda x: re.sub(" +", " ", x))
|
data['content'] = data.content.parallel_apply(lambda x: re.sub(" +", " ", x))
|
||||||
|
|
||||||
# TODO: lemmas? See spaCy
|
# TODO: lemmas? See spaCy
|
||||||
|
|
||||||
@@ -184,6 +187,7 @@ def main():
|
|||||||
articles = parse_and_extract(args.input, args.verbose)
|
articles = parse_and_extract(args.input, args.verbose)
|
||||||
|
|
||||||
data = scrub_data(articles, args.verbose)
|
data = scrub_data(articles, args.verbose)
|
||||||
|
#print(data)
|
||||||
|
|
||||||
write_csv(data, args.output)
|
write_csv(data, args.output)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user