Commit 6f78640e authored by Loïc Grobol
Browse files

improve stats

parent 38ca324d
Pipeline #8626 passed with stages
in 1 minute and 56 seconds
......
@@ -30,13 +30,18 @@ def main(corpus_path: pathlib.Path):
trees: Dict[str, Tuple[str, conllu.TokenList]] = dict()
print("Per file:")
print("file\ttokens\ttrees")
total_tokens = 0
total_trees = 0
for f in corpus_path.glob("*.conllu"):
with open(f) as in_stream:
file_trees = list(conllu.parse_incr(in_stream))
trees.update((t.metadata["sent_id"], (f.stem, t)) for t in file_trees)
n_trees = len(file_trees)
n_tokens = sum(len(t) for t in file_trees)
n_trees = len(file_trees)
total_tokens += n_tokens
total_trees += n_trees
print(f"{f.stem}\t{n_tokens}\t{n_trees}")
print(f"total\t{total_tokens}\t{total_trees}")
print()
print("Tokens repartition:")
with open(corpus_path / "split.json") as in_stream:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment