diff --git a/africat/aa_create_dataset.py b/africat/aa_create_dataset.py index a93045b..9564244 100755 --- a/africat/aa_create_dataset.py +++ b/africat/aa_create_dataset.py @@ -105,6 +105,9 @@ def parse_and_extract(input_dir, verbose): text = list() lang = "" try: + for t in doc.find('./publisher_headline'): + if t.text is not None: + text.append(t.text) for p in doc.find('./body'): if p.text is not None: text.append(p.text) @@ -152,8 +155,8 @@ def scrub_data(articles, verbose): data['content'] = data.content.parallel_apply(lambda x: x.translate(remove_digits)) # Remove extra spaces - data['content']=data.content.parallel_apply(lambda x: x.strip()) - data['content']=data.content.parallel_apply(lambda x: re.sub(" +", " ", x)) + data['content'] = data.content.parallel_apply(lambda x: x.strip()) + data['content'] = data.content.parallel_apply(lambda x: re.sub(" +", " ", x)) # TODO: lemmas? See spaCy @@ -184,6 +187,7 @@ def main(): articles = parse_and_extract(args.input, args.verbose) data = scrub_data(articles, args.verbose) + #print(data) write_csv(data, args.output)