Create some ready-to-use notebooks
You asked how we could query the database as a starting point using Voici.
I leave it to you to decide on the format of the file that will mirror the database. I will begin with queries using 1 table, but some others are more complex and require querying multiple tables to concatenate pieces of information.
We can look for a word:
- is it in the database? There is the get_word_id function:
def get_word_id(word, book=None, create=True, activate=False, con=False, cursor=False):
    """
    Get the id of an existing word, e.g. '牛', optionally creating or reactivating it.

    Parameters:
        word: the (clean) word string to look up.
        book: for new word entries to be created, the title of the source where the
            word was [first] encountered, e.g. '爾雅'. Optimally, this is the
            earliest attestation.
        create: True – unknown words will be added to the DB.
        activate: True – deactivated words will be reactivated.
        con, cursor: db connection handed down from the calling method, if applicable.

    Returns:
        The integer word id on success, False otherwise.
    """
    if not con:
        con, cursor = database_connect()
        internal = True  # we opened the connection, so we must close it
    else:
        internal = False
    # Bound parameters instead of string interpolation (SQL-injection safe).
    # NOTE(review): assumes a DB-API driver with 'format' paramstyle (%s),
    # consistent with the placeholders used elsewhere in this module — confirm.
    cursor.execute(
        "select id, inactive from the_words where cleanword = %s order by id limit 1",
        (word,),
    )
    row = cursor.fetchone()
    word_id, inactive = row if row is not None else (None, 0)

    if word_id is not None and inactive == 0:
        if internal:
            con.close()  # bug fix: original leaked the connection on this path
        return word_id

    if inactive == 0:
        # Check emptiness BEFORE the regex: matching against None/False would
        # raise a TypeError in the original ordering.
        # Do not create X 之 Y expressions as words, nor empty words.
        # NOTE(review): \p{IsHan} is not supported by the stdlib `re` module —
        # this relies on the third-party `regex` package being imported as `re`.
        if (word == '' or word is None or word is False
                or re.compile(r'(\p{IsHan}{1,4}[之皆而如在所有曰]\p{IsHan}{1,4}|^[前後左右所其之在].+|.+[也者之曰]$)').match(word)):
            if internal:
                con.close()
            return False
        if not create:
            print("❌ No entry for %s." % word)
            if internal:
                con.close()
            return False
        cursor.execute("select max(id)+1 from the_words")
        next_id = cursor.fetchone()[0]
        # 156656 is the fallback book id used when no source is given
        # — presumably an "unknown source" record; TODO confirm.
        book_id = get_book_id(book) if book is not None else 156656
        if not book_id:
            print("❌ Could not create a word entry【%s】(Failed to retrieve source book id for %s)" % (word, book))
            if internal:
                con.close()
            return False  # bug fix: original fell through and returned None
        cursor.execute(
            "insert into the_words (id, cleanword, nakedword, earliest_evidence_book_id) values (%s, %s, %s, %s)",
            (next_id, word, word, book_id),
        )
        con.commit()
        # Re-read the row to verify the insert actually worked: extended chars
        # like 𦺋𧃒 can be silently mangled by connection encoding issues.
        cursor.execute("select cleanword from the_words where id = %s", (next_id,))
        inserted_word = cursor.fetchone()[0]
        if inserted_word == word:
            print("✳️ Created new word entry 【%s】, ID %s." % (word, next_id))
            if internal:
                con.close()
            return next_id
        # Roll the broken row back out of the table.
        cursor.execute("delete from the_words where id = %s", (next_id,))
        con.commit()
        if internal:
            con.close()
        print("❌ Could not create a word entry【%s】(unsupported chars?)" % word)
        return False

    if inactive == 1 and activate:
        cursor.execute("update the_words set inactive = 0 where id = %s", (word_id,))
        con.commit()
        print("Word entry %s for %s has been reactivated." % (word_id, word))
        if internal:
            con.close()
        return word_id  # bug fix: original returned None after reactivating

    print("Word entry %s for %s is deactivated." % (word_id, word))
    if internal:
        con.close()
    return False
However, it's not that interesting to get the word_id by itself; it's more interesting to get information such as:
pínɡ 苹 attested in 戰國 · · 詩經 -1100 -700 -900.0 詩經 Shi Jing 戰國 after entering the word 苹.
- does the word belong to a specific category? This is very useful. To this aim, we used the following function:
def get_categories_by_words(wordlist=False):
    """
    Fetch word data, including semantic categories, for a list of words.

    Parameters:
        wordlist: a list of strings/words, e.g. ['妳', '我', '她'].
            Falsy (default) returns all available data.

    Returns:
        pandas DataFrame indexed by relation_id. Words can appear n times in
        the df when they have multiple categories.
    """
    con, cursor = database_connect()
    # bug fix: `book_id` in the coalesce was unqualified; the sibling queries
    # (get_words_by_category, retrieve_synonyms) qualify it as w.book_id.
    query = """
    select wic.id as relation_id, w.id, w.cleanword, c.id as category_id, c.name, b.clearbook as category_source, b.startyear, b.endyear, b2.clearbook as word_earliest_attestation, b2.startyear as word_att_startyear, b2.endyear as word_att_endyear, wic.categorized_by, wic.comment, wic.inactive from the_words w
    left outer join words_in_categories wic on w.id = wic.word_id
    left join the_books b on wic.category_source = b.id
    left join the_books b2 on coalesce(w.earliest_evidence_book_id,w.book_id) = b2.id
    left join semantic_categories c on wic.category_id = c.id where """
    if wordlist:
        # NOTE(review): the words are interpolated directly into the SQL string;
        # fine for trusted notebook input, but switch to bound parameters if the
        # list can come from untrusted users.
        implode = "'" + "','".join(wordlist) + "'"
        query += " w.cleanword in (" + implode + ") and "
    query += "w.inactive = 0 and wic.inactive = 0 order by w.cleanword, c.name"
    data = pd.read_sql(query, con, index_col='relation_id')
    con.close()  # bug fix: connection was never closed
    return data
Expected result (in this case, the word is part of two categories, but since one is a subcategory of another, the flora_plant_wormwood might be enough):
| id | cleanword | category_id | name | category_source | startyear | endyear | word_earliest_attestation | word_att_startyear | word_att_endyear | categorized_by | comment | inactive | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| relation_id | |||||||||||||
| 3548 | 253237 | 苹 | 17 | flora_plant | 爾雅 | -300 | -200 | 詩經 | -1100 | -700 | TS | parsed | 0 |
| 4136 | 253237 | 苹 | 58 | flora_plant_wormwood | CHI-KNOW-PO | 2022 | 2022 | 詩經 | -1100 |
- Then we could ask/automatically display all the words which belong to the same categories: We used:
def get_words_by_category(category_name, sub=True, source=None):
    """
    Fetch all words belonging to a given semantic category.

    Parameters:
        category_name: e.g. 'fauna_beast'.
        sub: True – also return words from sub(sub-)categories of the given
            category_name; False – strictly return the words in the given category.
        source: leave empty to receive words regardless of the source of the
            categorization; use to restrict results to a specific source,
            e.g. '爾雅'.

    Returns:
        pandas DataFrame of words in the given category, indexed by relation_id.
    """
    con, cursor = database_connect()
    restrict = category_name
    if sub:
        # Trailing SQL wildcard makes `like` match all sub-categories too.
        restrict += '%'
    # NOTE(review): category_name/source are interpolated into the SQL string;
    # acceptable for trusted notebook input, otherwise use bound parameters.
    query = """
    select wic.id as relation_id, w.id, w.cleanword, c.name, b.clearbook as category_source, b.startyear, b.endyear, b2.clearbook as word_earliest_attestation, b2.startyear as word_att_startyear, b2.endyear as word_att_endyear, wic.categorized_by, wic.comment, wic.inactive from words_in_categories wic
    left join the_words w on wic.word_id = w.id
    left join the_books b on wic.category_source = b.id
    left join the_books b2 on coalesce(w.earliest_evidence_book_id,w.book_id) = b2.id
    left join semantic_categories c on wic.category_id = c.id
    where c.name like '%s' and w.inactive = 0 and wic.inactive = 0""" % (restrict)
    if source:
        query += " and b.clearbook = '%s' " % (source)
    query += " order by c.name"
    data = pd.read_sql(query, con, index_col='relation_id')
    con.close()  # bug fix: connection was never closed
    return data
- or does it belong to a list of "synonyms"? Again, here is what we used:
def retrieve_synonyms(wordlist=None, inverse=True, source=False, location=False):
    """
    Get synonym information from the DB.

    Parameters:
        wordlist: a list of words that need synonyms retrieved.
            retrieve_synonyms(wordlist=[term1, term2]) returns synonyms of the listed terms;
            retrieve_synonyms(wordlist=[]) returns an empty result;
            retrieve_synonyms(wordlist=False) (or None) returns all the words from the database.
        inverse: set to False to exclude "vice versa" entries where the given
            word is the synonym.
        source: restrict to synonyms from a specific source.
        location: restrict to local synonyms from a specific location.

    Returns:
        pandas DataFrame of the results, indexed by node_id.
    """
    con, cursor = database_connect()
    if wordlist:
        # NOTE(review): words are interpolated into the SQL string; use bound
        # parameters if the list can come from untrusted users.
        implode = "'" + "','".join(wordlist) + "'"
        word_filter_1 = " and w1.cleanword in (" + implode + ")"
        word_filter_2 = " and w2.cleanword in (" + implode + ")"
    elif wordlist is None or wordlist is False:
        # bug fix: the original test `wordlist != [] and wordlist is None` reduced
        # to `wordlist is None`, so wordlist=False produced an EMPTY result,
        # contradicting the documented behavior ("return all the words").
        word_filter_1 = word_filter_2 = ""
    else:
        # Empty list: deliberately match nothing.
        word_filter_1 = " and w1.cleanword in ('')"
        word_filter_2 = " and w2.cleanword in ('')"
    source_filter = " and b.clearbook = '%s'" % source if source else ""
    location_filter = " and l.name_chn = '%s'" % location if location else ""

    # bug fix: the original appended the source/location filters only to the
    # second SELECT of the union, so with inverse=True the first half of the
    # result ignored them. They are now applied to both halves.
    query = """
    select s.node_id, w2.id, w2.cleanword, b2.clearbook word_earliest_attestation, b2.startyear word_att_startyear, b2.endyear word_att_endyear, w1.id as syn_id, w1.cleanword as syn, b1.clearbook syn_earliest_attestation, b1.startyear syn_att_startyear, b1.endyear syn_att_endyear, l.name_chn as local, b.clearbook relation_source, b.startyear, b.endyear, s.comment from synonyms s
    left join the_words w1 on s.word_id_1 = w1.id
    left join the_words w2 on s.word_id_2 = w2.id
    left join the_books b on s.book_id = b.id
    left join the_books b1 on coalesce(w1.earliest_evidence_book_id,w1.book_id) = b1.id
    left join the_books b2 on coalesce(w2.earliest_evidence_book_id,w2.book_id) = b2.id
    left join the_locations l on s.location_id = l.id
    where 1 = 1 and w2.inactive = 0 and s.inactive = 0 """
    query += word_filter_1 + source_filter + location_filter
    if inverse:
        # Mirror query with word_id_1/word_id_2 swapped, so relations are found
        # regardless of which side of the pair the given word sits on.
        query += """
    union select s.node_id, w1.id, w1.cleanword, b1.clearbook word_earliest_attestation, b1.startyear word_att_startyear, b1.endyear word_att_endyear, w2.id as syn_id, w2.cleanword as syn, b2.clearbook syn_earliest_attestation, b2.startyear syn_att_startyear, b2.endyear syn_att_endyear, l.name_chn as local, b.clearbook relation_source, b.startyear, b.endyear, s.comment from synonyms s
    left join the_words w1 on s.word_id_1 = w1.id
    left join the_words w2 on s.word_id_2 = w2.id
    left join the_books b on s.book_id = b.id
    left join the_books b1 on coalesce(w1.earliest_evidence_book_id,w1.book_id) = b1.id
    left join the_books b2 on coalesce(w2.earliest_evidence_book_id,w2.book_id) = b2.id
    left join the_locations l on s.location_id = l.id
    where 1 = 1 and w1.inactive = 0 and s.inactive = 0 """
        query += word_filter_2 + source_filter + location_filter
    data = pd.read_sql(query, con, index_col='node_id')
    con.close()  # bug fix: connection was never closed
    return data
We could also start by choosing a category, and ask for all the words that belong to this category.
All the code that I quote was made available to the CHI-KNOW-PO project by Tilman Schalmey.
You can pick whatever you find interesting! One query is a good start.
Displaying people (persons) would be nice as well, but it is more complex.