Add tokenizer.py
Tokenizer.
This commit is contained in:
parent
53b99f18fc
commit
03fa14a1c7
177
tokenizer.py
Normal file
177
tokenizer.py
Normal file
@ -0,0 +1,177 @@
|
||||
|
||||
# I saved this script as a gist because I have rewritten it many times.
|
||||
# It supports remembering line numbers and similar position data, which are not used here.
|
||||
# It was originally written in C by me and ported to Python.
|
||||
# The original application did use these features.
|
||||
|
||||
|
||||
# Written by retoor@molodetz.nl
|
||||
|
||||
# This script processes text files to identify words, their positions, and their frequency of occurrence. It uses command-line arguments to query or display popular words and stores results in a SQLite database.
|
||||
|
||||
# Imports:
|
||||
# - argparse: For handling command-line arguments
|
||||
# - sqlite3: A library to control and manage SQLite databases
|
||||
# - pathlib: To work with filesystem paths in an object-oriented way
|
||||
|
||||
# MIT License:
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
|
||||
import argparse
|
||||
import sqlite3
|
||||
import pathlib
|
||||
|
||||
# Command-line interface. All three options are optional:
#   --find WORD   value is a string, empty when the option is omitted
#   --index       boolean flag
#   --popular     boolean flag
parser = argparse.ArgumentParser()
parser.add_argument('--find', default="")
parser.add_argument('--index', action='store_true')
parser.add_argument('--popular', action='store_true')

# Parsed once at module level; the rest of the script reads `args`.
args = parser.parse_args()
|
||||
|
||||
def is_valid_char(c):
    """Return True when *c* is an underscore or an alphanumeric string."""
    if c == '_':
        return True
    return c.isalnum()
|
||||
|
||||
def _report_word(words, chars, start, end, line, alinia, current_alinia):
    """Print one finished word with its position data and count it in *words*."""
    word_str = ''.join(chars)
    print(f"{word_str} {start} {end} {len(chars)} {line} {alinia} {current_alinia}")
    words[word_str] = words.get(word_str, 0) + 1


def process_file(file_path):
    """Tokenize a text file and count how often each word occurs.

    A word is a maximal run of characters accepted by ``is_valid_char``;
    ``.`` and newline always end a word.  Every finished word is printed as
    ``word start end length line alinia current_alinia`` where

    * ``start`` / ``end`` are the 1-based positions (in characters as read in
      text mode) of the word's first and last character,
    * ``length`` is the number of characters in the word,
    * ``line`` is a counter that starts at 1 and grows by one for every ``.``
      read, sampled when the word started,
    * ``alinia`` is a paragraph counter that starts at 1 and grows by one
      when a word starts after two or more newlines since the previous word
      character, sampled when the word started,
    * ``current_alinia`` is that same counter when the word is reported.

    Args:
        file_path: Path of the file to read; passed to ``open`` in text mode.

    Returns:
        A dict mapping each word to the number of times it occurs.
    """
    words = {}
    word = []            # characters of the word currently being built
    word_start = -1      # -1 means "not inside a word"
    word_line = 0
    word_alinia = 0
    new_line_count = 0   # newlines seen since the last word character
    pos = 0
    line = 1
    alinia = 1

    with open(file_path, 'r') as f:
        # Iterating the file uses the buffered reader instead of issuing one
        # read(1) call per character; the character sequence is the same.
        for text_line in f:
            for c in text_line:
                pos += 1

                if c == '.':
                    line += 1
                elif c == '\n':
                    new_line_count += 1

                if c == '.' or c == '\n' or not is_valid_char(c):
                    if word_start > -1:
                        # The separator sits at `pos`, so the word ended one
                        # character earlier.
                        _report_word(words, word, word_start, pos - 1,
                                     word_line, word_alinia, alinia)
                        word_start = -1
                        word = []
                    continue

                # Only a run of newlines not interrupted by a word counts as a
                # paragraph break, so the counter restarts at every word
                # character.
                if new_line_count >= 2:
                    alinia += 1
                new_line_count = 0

                word.append(c)
                if word_start == -1:
                    word_start = pos
                    word_line = line
                    word_alinia = alinia

    # A file that ends in the middle of a word has no trailing separator to
    # trigger the report above; record that last word here.
    if word_start > -1:
        _report_word(words, word, word_start, pos,
                     word_line, word_alinia, alinia)

    return words
|
||||
|
||||
class WordDb:
    """SQLite-backed table of words and their occurrence counts.

    The ``words`` table has two columns, ``word`` (TEXT) and ``count``
    (INTEGER).  ``self.words`` is an in-memory cache of the counts this
    instance has written or looked up; the database stays the source of truth.

    The instance can be used as a context manager; leaving the ``with`` block
    commits and closes the connection.
    """

    def __init__(self, path):
        """Open (or create) the database at *path* and ensure the table exists.

        Args:
            path: Database location handed to ``sqlite3.connect``.
        """
        # Set before connecting so close()/__del__ stay safe if connect() raises.
        self.conn = None
        self.cursor = None
        self._closed = False
        self.path = path
        self.words = {}
        self.conn = sqlite3.connect(path)
        self.cursor = self.conn.cursor()
        # Without this, every query on a database that was never reset()
        # fails with "no such table: words".
        self._create_schema()
        self.conn.commit()

    def _create_schema(self):
        """Create the ``words`` table and its lookup index if they are missing."""
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS words (
                word TEXT NOT NULL,
                count INTEGER NOT NULL
            )
        """)
        # insert() and get() look rows up by word; the index avoids a full
        # table scan for each of those lookups.
        self.cursor.execute(
            "CREATE INDEX IF NOT EXISTS words_word_idx ON words (word)"
        )

    def reset(self):
        """Drop all stored words and start again with an empty table."""
        self.words = {}
        self.cursor.execute("DROP TABLE IF EXISTS words")
        self._create_schema()
        self.conn.commit()

    def insert(self, word, count):
        """Add *count* occurrences of *word*.

        The change is not committed; call ``commit`` (or ``close``) to
        persist it.

        Args:
            word: The word to record.
            count: Number of occurrences to add to the stored total.
        """
        if word not in self.words:
            # The cache starts empty even when the database already holds
            # rows, so consult the table before deciding to insert a new row.
            self.cursor.execute("SELECT count FROM words WHERE word = ?", (word,))
            row = self.cursor.fetchone()
            if row is None:
                self.words[word] = count
                self.cursor.execute(
                    "INSERT INTO words (word, count) VALUES (?, ?)", (word, count)
                )
                return
            self.words[word] = row[0]

        self.words[word] += count
        self.cursor.execute(
            "UPDATE words SET count = ? WHERE word = ?", (self.words[word], word)
        )

    def commit(self):
        """Commit pending changes to the database."""
        self.conn.commit()

    def total_count(self):
        """Return the sum of all stored counts (0 when the table is empty)."""
        # SUM() over zero rows is NULL; COALESCE turns that into 0.
        self.cursor.execute("SELECT COALESCE(SUM(count), 0) FROM words")
        return self.cursor.fetchone()[0]

    def get(self, word):
        """Return the stored count for *word*, or 0 when it is not stored."""
        # Aggregating always yields exactly one row, so an unknown word gives
        # 0 instead of a failed subscript on a missing row.
        self.cursor.execute(
            "SELECT COALESCE(SUM(count), 0) FROM words WHERE word = ?", (word,)
        )
        return self.cursor.fetchone()[0]

    def most_popular(self, count):
        """Return up to *count* ``(word, count)`` tuples, highest count first."""
        self.cursor.execute(
            "SELECT word, count FROM words ORDER BY count DESC LIMIT ?", (count,)
        )
        return self.cursor.fetchall()

    def close(self):
        """Commit and close the connection; later calls do nothing."""
        conn = getattr(self, "conn", None)
        if conn is None or getattr(self, "_closed", False):
            return
        self._closed = True
        try:
            conn.commit()
        finally:
            conn.close()
        print("Database closed")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __del__(self):
        # Fallback for callers that never call close(); close() tolerates a
        # partially initialised or already closed instance.
        self.close()
|
||||
|
||||
# Module-level database handle used by index() and by the command-line
# dispatch at the bottom of the file.
db = WordDb("tags.db")
|
||||
|
||||
def index():
    """Tokenize every file in ``logs_plain`` and store the word counts in ``db``.

    After all files are processed, pretty-prints the 100 most frequent words,
    commits the database and prints the number of distinct words encountered
    during this run.
    """
    from pprint import pprint as pp

    words = set()  # distinct words seen in this run
    # Sorted so the files are processed in a reproducible order.
    for path in sorted(pathlib.Path("logs_plain").iterdir()):
        # iterdir() also yields sub-directories, which cannot be opened as
        # text files.
        if not path.is_file():
            continue
        for key, value in process_file(path).items():
            db.insert(key, value)
            words.add(key)

    pp(db.most_popular(100))
    db.commit()
    print(len(words))
|
||||
|
||||
# Act on the parsed command-line flags.  Re-indexing runs first so that a
# query given together with --index reports the freshly built counts rather
# than the contents of the table that is about to be dropped.
if args.index:
    db.reset()
    index()

# --find WORD: print the stored count for that word.
if args.find:
    print(db.get(args.find))

# --popular: print the 300 most frequent (word, count) rows, then the total
# of all counts.
if args.popular:
    for item in db.most_popular(300):
        print(item)
    print(db.total_count())
|
Loading…
Reference in New Issue
Block a user