Create some ready-to-use notebooks
You asked how we could query the database as a starting point using Voici.
I leave it to you to decide on the format of the file that will mirror the database. I will begin with queries using 1 table, but some others are more complex and require querying multiple tables to concatenate pieces of information.
We can look for a word:
- is it in the database? There is the get_word_id function:
def get_word_id(word, book=None, create=True, activate=False, con=False, cursor=False):
    """
    Get the id of an existing word, e.g. '牛', optionally creating or reactivating it.

    Parameters:
        word: the (clean) word string to look up.
        book: for new word entries to be created, the title of the source where the
            word was [first] encountered, e.g. '爾雅'. Optimally, this is the
            earliest attestation.
        create: True – unknown words will be added to the DB.
        activate: True – deactivated words will be reactivated.
        con, cursor: db connection handed down from the calling method, if applicable.

    Returns:
        The integer word id on success, False otherwise.
    """
    if not con:
        con, cursor = database_connect()
        internal = True  # we opened the connection, so we must close it
    else:
        internal = False
    # Bound parameters instead of string interpolation (SQL-injection safe).
    # NOTE(review): assumes a DB-API driver with 'format' paramstyle (%s),
    # consistent with the placeholders used elsewhere in this module — confirm.
    cursor.execute(
        "select id, inactive from the_words where cleanword = %s order by id limit 1",
        (word,),
    )
    row = cursor.fetchone()
    word_id, inactive = row if row is not None else (None, 0)

    if word_id is not None and inactive == 0:
        if internal:
            con.close()  # bug fix: original leaked the connection on this path
        return word_id

    if inactive == 0:
        # Check emptiness BEFORE the regex: matching against None/False would
        # raise a TypeError in the original ordering.
        # Do not create X 之 Y expressions as words, nor empty words.
        # NOTE(review): \p{IsHan} is not supported by the stdlib `re` module —
        # this relies on the third-party `regex` package being imported as `re`.
        if (word == '' or word is None or word is False
                or re.compile(r'(\p{IsHan}{1,4}[之皆而如在所有曰]\p{IsHan}{1,4}|^[前後左右所其之在].+|.+[也者之曰]$)').match(word)):
            if internal:
                con.close()
            return False
        if not create:
            print("❌ No entry for %s." % word)
            if internal:
                con.close()
            return False
        cursor.execute("select max(id)+1 from the_words")
        next_id = cursor.fetchone()[0]
        # 156656 is the fallback book id used when no source is given
        # — presumably an "unknown source" record; TODO confirm.
        book_id = get_book_id(book) if book is not None else 156656
        if not book_id:
            print("❌ Could not create a word entry【%s】(Failed to retrieve source book id for %s)" % (word, book))
            if internal:
                con.close()
            return False  # bug fix: original fell through and returned None
        cursor.execute(
            "insert into the_words (id, cleanword, nakedword, earliest_evidence_book_id) values (%s, %s, %s, %s)",
            (next_id, word, word, book_id),
        )
        con.commit()
        # Re-read the row to verify the insert actually worked: extended chars
        # like 𦺋𧃒 can be silently mangled by connection encoding issues.
        cursor.execute("select cleanword from the_words where id = %s", (next_id,))
        inserted_word = cursor.fetchone()[0]
        if inserted_word == word:
            print("✳️ Created new word entry 【%s】, ID %s." % (word, next_id))
            if internal:
                con.close()
            return next_id
        # Roll the broken row back out of the table.
        cursor.execute("delete from the_words where id = %s", (next_id,))
        con.commit()
        if internal:
            con.close()
        print("❌ Could not create a word entry【%s】(unsupported chars?)" % word)
        return False

    if inactive == 1 and activate:
        cursor.execute("update the_words set inactive = 0 where id = %s", (word_id,))
        con.commit()
        print("Word entry %s for %s has been reactivated." % (word_id, word))
        if internal:
            con.close()
        return word_id  # bug fix: original returned None after reactivating

    print("Word entry %s for %s is deactivated." % (word_id, word))
    if internal:
        con.close()
    return False
However, it's not that interesting to get the word_id by itself; it's more interesting to get information such as:
pínɡ 苹 attested in 戰國 · · 詩經 -1100 -700 -900.0 詩經 Shi Jing 戰國 after entering the word 苹.
- does the word belong to a specific category? This is very useful. To this aim, we used the following function:
def get_categories_by_words(wordlist=False):
    """
    Fetch word data, including semantic categories, for a list of words.

    Parameters:
        wordlist: a list of strings/words, e.g. ['妳', '我', '她'].
            Falsy (default) returns all available data.

    Returns:
        pandas DataFrame indexed by relation_id. Words can appear n times in
        the df when they have multiple categories.
    """
    con, cursor = database_connect()
    # bug fix: `book_id` in the coalesce was unqualified; the sibling queries
    # (get_words_by_category, retrieve_synonyms) qualify it as w.book_id.
    query = """
    select wic.id as relation_id, w.id, w.cleanword, c.id as category_id, c.name, b.clearbook as category_source, b.startyear, b.endyear, b2.clearbook as word_earliest_attestation, b2.startyear as word_att_startyear, b2.endyear as word_att_endyear, wic.categorized_by, wic.comment, wic.inactive from the_words w
    left outer join words_in_categories wic on w.id = wic.word_id
    left join the_books b on wic.category_source = b.id
    left join the_books b2 on coalesce(w.earliest_evidence_book_id,w.book_id) = b2.id
    left join semantic_categories c on wic.category_id = c.id where """
    if wordlist:
        # NOTE(review): the words are interpolated directly into the SQL string;
        # fine for trusted notebook input, but switch to bound parameters if the
        # list can come from untrusted users.
        implode = "'" + "','".join(wordlist) + "'"
        query += " w.cleanword in (" + implode + ") and "
    query += "w.inactive = 0 and wic.inactive = 0 order by w.cleanword, c.name"
    data = pd.read_sql(query, con, index_col='relation_id')
    con.close()  # bug fix: connection was never closed
    return data
Expected result (in this case, the word is part of two categories, but since one is a subcategory of another, the flora_plant_wormwood might be enough):
| id | cleanword | category_id | name | category_source | startyear | endyear | word_earliest_attestation | word_att_startyear | word_att_endyear | categorized_by | comment | inactive | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| relation_id | |||||||||||||
| 3548 | 253237 | 苹 | 17 | flora_plant | 爾雅 | -300 | -200 | 詩經 | -1100 | -700 | TS | parsed | 0 |
| 4136 | 253237 | 苹 | 58 | flora_plant_wormwood | CHI-KNOW-PO | 2022 | 2022 | 詩經 | -1100 |
- Then we could ask/automatically display all the words which belong to the same categories: We used:
def get_words_by_category(category_name, sub=True, source=None):
    """
    Fetch all words belonging to a given semantic category.

    Parameters:
        category_name: e.g. 'fauna_beast'.
        sub: True – also return words from sub(sub-)categories of the given
            category_name; False – strictly return the words in the given category.
        source: leave empty to receive words regardless of the source of the
            categorization; use to restrict results to a specific source,
            e.g. '爾雅'.

    Returns:
        pandas DataFrame of words in the given category, indexed by relation_id.
    """
    con, cursor = database_connect()
    restrict = category_name
    if sub:
        # Trailing SQL wildcard makes `like` match all sub-categories too.
        restrict += '%'
    # NOTE(review): category_name/source are interpolated into the SQL string;
    # acceptable for trusted notebook input, otherwise use bound parameters.
    query = """
    select wic.id as relation_id, w.id, w.cleanword, c.name, b.clearbook as category_source, b.startyear, b.endyear, b2.clearbook as word_earliest_attestation, b2.startyear as word_att_startyear, b2.endyear as word_att_endyear, wic.categorized_by, wic.comment, wic.inactive from words_in_categories wic
    left join the_words w on wic.word_id = w.id
    left join the_books b on wic.category_source = b.id
    left join the_books b2 on coalesce(w.earliest_evidence_book_id,w.book_id) = b2.id
    left join semantic_categories c on wic.category_id = c.id
    where c.name like '%s' and w.inactive = 0 and wic.inactive = 0""" % (restrict)
    if source:
        query += " and b.clearbook = '%s' " % (source)
    query += " order by c.name"
    data = pd.read_sql(query, con, index_col='relation_id')
    con.close()  # bug fix: connection was never closed
    return data
- or does it belong to a list of "synonyms"? Again, here is what we used:
def retrieve_synonyms(wordlist=None, inverse=True, source=False, location=False):
    """
    Get synonym information from the DB.

    Parameters:
        wordlist: a list of words that need synonyms retrieved.
            retrieve_synonyms(wordlist=[term1, term2]) returns synonyms of the listed terms;
            retrieve_synonyms(wordlist=[]) returns an empty result;
            retrieve_synonyms(wordlist=False) (or None) returns all the words from the database.
        inverse: set to False to exclude "vice versa" entries where the given
            word is the synonym.
        source: restrict to synonyms from a specific source.
        location: restrict to local synonyms from a specific location.

    Returns:
        pandas DataFrame of the results, indexed by node_id.
    """
    con, cursor = database_connect()
    if wordlist:
        # NOTE(review): words are interpolated into the SQL string; use bound
        # parameters if the list can come from untrusted users.
        implode = "'" + "','".join(wordlist) + "'"
        word_filter_1 = " and w1.cleanword in (" + implode + ")"
        word_filter_2 = " and w2.cleanword in (" + implode + ")"
    elif wordlist is None or wordlist is False:
        # bug fix: the original test `wordlist != [] and wordlist is None` reduced
        # to `wordlist is None`, so wordlist=False produced an EMPTY result,
        # contradicting the documented behavior ("return all the words").
        word_filter_1 = word_filter_2 = ""
    else:
        # Empty list: deliberately match nothing.
        word_filter_1 = " and w1.cleanword in ('')"
        word_filter_2 = " and w2.cleanword in ('')"
    source_filter = " and b.clearbook = '%s'" % source if source else ""
    location_filter = " and l.name_chn = '%s'" % location if location else ""

    # bug fix: the original appended the source/location filters only to the
    # second SELECT of the union, so with inverse=True the first half of the
    # result ignored them. They are now applied to both halves.
    query = """
    select s.node_id, w2.id, w2.cleanword, b2.clearbook word_earliest_attestation, b2.startyear word_att_startyear, b2.endyear word_att_endyear, w1.id as syn_id, w1.cleanword as syn, b1.clearbook syn_earliest_attestation, b1.startyear syn_att_startyear, b1.endyear syn_att_endyear, l.name_chn as local, b.clearbook relation_source, b.startyear, b.endyear, s.comment from synonyms s
    left join the_words w1 on s.word_id_1 = w1.id
    left join the_words w2 on s.word_id_2 = w2.id
    left join the_books b on s.book_id = b.id
    left join the_books b1 on coalesce(w1.earliest_evidence_book_id,w1.book_id) = b1.id
    left join the_books b2 on coalesce(w2.earliest_evidence_book_id,w2.book_id) = b2.id
    left join the_locations l on s.location_id = l.id
    where 1 = 1 and w2.inactive = 0 and s.inactive = 0 """
    query += word_filter_1 + source_filter + location_filter
    if inverse:
        # Mirror query with word_id_1/word_id_2 swapped, so relations are found
        # regardless of which side of the pair the given word sits on.
        query += """
    union select s.node_id, w1.id, w1.cleanword, b1.clearbook word_earliest_attestation, b1.startyear word_att_startyear, b1.endyear word_att_endyear, w2.id as syn_id, w2.cleanword as syn, b2.clearbook syn_earliest_attestation, b2.startyear syn_att_startyear, b2.endyear syn_att_endyear, l.name_chn as local, b.clearbook relation_source, b.startyear, b.endyear, s.comment from synonyms s
    left join the_words w1 on s.word_id_1 = w1.id
    left join the_words w2 on s.word_id_2 = w2.id
    left join the_books b on s.book_id = b.id
    left join the_books b1 on coalesce(w1.earliest_evidence_book_id,w1.book_id) = b1.id
    left join the_books b2 on coalesce(w2.earliest_evidence_book_id,w2.book_id) = b2.id
    left join the_locations l on s.location_id = l.id
    where 1 = 1 and w1.inactive = 0 and s.inactive = 0 """
        query += word_filter_2 + source_filter + location_filter
    data = pd.read_sql(query, con, index_col='node_id')
    con.close()  # bug fix: connection was never closed
    return data
We could also start by choosing a category, and ask for all the words that belong to this category.
All the code that I quote was made available to the CHI-KNOW-PO project by Tilman Schalmey.
You can pick whatever you find interesting! One query is a good start.
Displaying people (persons) would be nice as well, but it is more complex.