import polars as pl
from textstat import textstat
from dakoda.corpus import DakodaCorpus
from IPython.display import display
import itables
# Initialise itables' notebook mode; all_interactive=True asks itables to
# render every displayed DataFrame as an interactive table.
itables.init_notebook_mode(all_interactive=True)
# Switch textstat to German ("de") before any score is computed, so the
# readability values below use that language setting.
textstat.set_lang("de")
# Open the corpus stored under data/merlin; it is iterated document by
# document further down.
corpus = DakodaCorpus("data/merlin")
# Build one record per corpus document: the name of its text file, the
# Flesch reading-ease score of its text, and the value of the
# `text_proficiency_cefrMax` metadata field (used as the CEFR label below).
#
# The former empty placeholder DataFrame was removed: it was overwritten
# before ever being read and did not even declare the `cefr` column.
data = [
    {
        'filename': doc.meta.text.text_file,
        'readability': textstat.flesch_reading_ease(doc.text),
        'cefr': doc.meta.text.proficiency.text_proficiency_cefrMax,
    }
    for doc in corpus
]

# Materialise the records as a polars DataFrame and preview the first rows.
df = pl.DataFrame(data)
display(df.head())
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
# Overall distribution of the readability scores across all documents.
# The scores are drawn along the y axis (y='readability'), so the
# human-readable label has to be set on the y axis as well; labelling the
# x axis would caption the empty categorical axis instead.
sns.violinplot(data=df, y='readability')
plt.title('Distribution of Readability Scores')
plt.ylabel('Readability Score')
plt.show()
# Readability scores grouped by CEFR label, with the boxes arranged from
# the lowest level (A1) to the highest (C2) rather than in data order.
order = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
sns.boxplot(x='cefr', y='readability', data=df, order=order)
# Title, axis labels and an explicit show() mirror the violin plot above so
# the figure is self-describing and is also rendered when this code runs
# outside a notebook's implicit display.
plt.title('Readability Scores by CEFR Level')
plt.xlabel('CEFR Level')
plt.ylabel('Readability Score')
plt.show()