Linguistische Komplexität

In [1]:
import polars as pl
from textstat import textstat
from dakoda.corpus import DakodaCorpus
from IPython.display import display
import itables
itables.init_notebook_mode(all_interactive=True)

textstat.set_lang("de")
In [2]:
corpus = DakodaCorpus("data/merlin")

# Start with empty DataFrame
df = pl.DataFrame({
    'filename': [],
    'readability': [],
})
data = []
for doc in corpus: 
    meta = doc.meta
    data.append({
        'filename': meta.text.text_file,
        'readability': textstat.flesch_reading_ease(doc.text),
        'cefr': meta.text.proficiency.text_proficiency_cefrMax
        }
    )

df = pl.DataFrame(data)
display(df.head())
Loading ITables v2.5.2 from the init_notebook_mode cell... (need help?)
In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

sns.violinplot(data=df, y='readability')
plt.title('Distribution of Readability Scores')
plt.xlabel('Readability Score')
plt.show()
In [4]:
order = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
sns.boxplot(x='cefr', y='readability', data=df, order=order)
Out[4]:
<Axes: xlabel='cefr', ylabel='readability'>