# I saved this script as a gist because I have rewritten it many times.
# It supports remembering line numbers and so on, which is not used here.
# It was originally written in C by me and ported to Python.
# The original application did use these features.

# Written by retoor@molodetz.nl

# This script processes text files to identify words, their positions, and
# their frequency of occurrence. It uses command-line arguments to query or
# display popular words and stores results in a SQLite database.

# Imports:
# - argparse: For handling command-line arguments
# - sqlite3: A library to control and manage SQLite databases
# - pathlib: To work with filesystem paths in an object-oriented way

# MIT License:
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Count word occurrences in text files and store the totals in SQLite."""

import argparse
import pathlib
import sqlite3
from pprint import pprint

# Database file and input directory used by the command-line entry point.
DB_PATH = "tags.db"
LOG_DIR = "logs_plain"

# Number of characters requested per read while scanning a file.
_CHUNK_SIZE = 65536

parser = argparse.ArgumentParser()
parser.add_argument('--find', type=str, required=False, default="")
parser.add_argument('--index', action='store_true')
parser.add_argument('--popular', action='store_true')


def is_valid_char(c: str) -> bool:
    """Return True when *c* may be part of a word (alphanumeric or '_')."""
    return c.isalnum() or c == '_'


def process_file(file_path, verbose=True) -> dict:
    """Split a text file into words and count how often each one occurs.

    A word is a maximal run of characters accepted by ``is_valid_char``.
    While scanning, three counters are maintained for the diagnostic output:
    the 1-based character position, a ``line`` counter that advances on every
    '.', and an ``alinia`` counter that advances when a word is preceded by at
    least two newlines since the previous word character.

    Args:
        file_path: Path of the file to read. It is opened in text mode with
            the platform default encoding; undecodable bytes are replaced
            instead of raising, so they act as word separators.
        verbose: When true (the default), print one line per word:
            ``word start end length line alinia current_alinia`` where start
            and end are 1-based inclusive character positions.

    Returns:
        A dict mapping each word to its number of occurrences in the file.
    """
    words = {}
    chars = []            # characters of the word currently being built
    word_start = -1       # 1-based position of its first character, -1 = none
    word_line = 0
    word_alinia = 0
    new_line_count = 0    # newlines seen since the last word character
    pos = 0
    line = 1
    alinia = 1

    def flush(word_end):
        """Record the pending word, if any, ending at position *word_end*."""
        nonlocal chars, word_start
        if word_start == -1:
            return
        word_str = ''.join(chars)
        # The length is taken from the collected characters; computing it as
        # end - start would be one short because both positions are inclusive.
        word_length = len(chars)
        if verbose:
            print(f"{word_str} {word_start} {word_end} {word_length} {word_line} {word_alinia} {alinia}")
        words[word_str] = words.get(word_str, 0) + 1
        word_start = -1
        chars = []

    with open(file_path, 'r', errors='replace') as f:
        # Read in chunks rather than one character per call; positions are
        # unaffected because they are counted per character below.
        while chunk := f.read(_CHUNK_SIZE):
            for c in chunk:
                pos += 1
                if c == '.':
                    line += 1
                elif c == '\n':
                    new_line_count += 1

                if not is_valid_char(c):
                    # The separator itself is at `pos`, so the word ended
                    # one character earlier.
                    flush(pos - 1)
                    continue

                if new_line_count >= 2:
                    alinia += 1
                # Reset on every word character so that only newlines with no
                # word between them can add up to a paragraph break.
                new_line_count = 0

                chars.append(c)
                if word_start == -1:
                    word_start = pos
                    word_line = line
                    word_alinia = alinia

    # A file that does not end with a separator still has a pending word.
    flush(pos)
    return words


class WordDb:
    """Word-count store backed by a SQLite table ``words(word, count)``.

    Attributes:
        path: The database path passed to the constructor.
        conn: The open ``sqlite3`` connection, or None once closed.
        cursor: Cursor used for all statements.
        words: Counts added through ``insert`` on this instance since it was
            created or last ``reset``.
    """

    def __init__(self, path):
        """Open (or create) the database at *path* and ensure the table exists."""
        self.path = path
        self.words = {}
        # Assigned before connecting so close()/__del__ are safe if connect raises.
        self.conn = None
        self.conn = sqlite3.connect(path)
        self.cursor = self.conn.cursor()
        self._create_schema()
        self.conn.commit()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def _create_schema(self):
        """Create the words table and its lookup index when they are missing."""
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS words (
                word TEXT NOT NULL,
                count INTEGER NOT NULL
            )
        """)
        # Non-unique so it can also be added to databases written earlier;
        # it keeps the per-word UPDATE in insert() from scanning the table.
        self.cursor.execute(
            "CREATE INDEX IF NOT EXISTS words_word_idx ON words (word)"
        )

    def reset(self):
        """Drop all stored counts and start again with an empty table."""
        self.words = {}
        self.cursor.execute("DROP TABLE IF EXISTS words")
        self._create_schema()
        self.conn.commit()

    def insert(self, word, count):
        """Add *count* to the stored total for *word*, creating the row if needed.

        The database is consulted rather than only the in-memory tally, so a
        word that was stored by an earlier run is updated instead of being
        inserted a second time.
        """
        self.cursor.execute(
            "UPDATE words SET count = count + ? WHERE word = ?", (count, word)
        )
        if self.cursor.rowcount == 0:
            self.cursor.execute(
                "INSERT INTO words (word, count) VALUES (?, ?)", (word, count)
            )
        self.words[word] = self.words.get(word, 0) + count

    def commit(self):
        """Commit pending changes to the database."""
        self.conn.commit()

    def total_count(self):
        """Return the sum of all counts, or 0 when the table is empty."""
        # SUM() over zero rows yields NULL, hence the COALESCE.
        self.cursor.execute("SELECT COALESCE(SUM(count), 0) FROM words")
        return self.cursor.fetchone()[0]

    def get(self, word):
        """Return the stored count for *word*, or 0 when it is not stored."""
        self.cursor.execute(
            "SELECT COALESCE(SUM(count), 0) FROM words WHERE word = ?", (word,)
        )
        return self.cursor.fetchone()[0]

    def most_popular(self, count):
        """Return up to *count* ``(word, count)`` tuples, highest count first."""
        self.cursor.execute(
            "SELECT word, count FROM words ORDER BY count DESC LIMIT ?", (count,)
        )
        return self.cursor.fetchall()

    def close(self):
        """Commit and close the connection; calling it again does nothing."""
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        self.conn = None
        conn.commit()
        conn.close()
        print("Database closed")

    def __del__(self):
        # Fallback for callers that never call close() explicitly.
        self.close()


def index(db=None, directory=LOG_DIR):
    """Count the words of every regular file in *directory* into *db*.

    After all files are processed the 100 most frequent words are
    pretty-printed, the changes are committed, and the number of distinct
    words seen during this run is printed.

    Args:
        db: The ``WordDb`` to add counts to. When omitted, the database at
            ``DB_PATH`` is opened for the duration of the call.
        directory: Directory whose files are read; sub-directories are
            skipped.
    """
    if db is None:
        with WordDb(DB_PATH) as own_db:
            index(own_db, directory)
        return

    distinct = set()
    for path in sorted(pathlib.Path(directory).iterdir()):
        if not path.is_file():
            continue
        for word, count in process_file(path).items():
            db.insert(word, count)
            distinct.add(word)

    pprint(db.most_popular(100))
    db.commit()
    print(len(distinct))


def main(argv=None):
    """Run the command-line interface.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    args = parser.parse_args(argv)
    with WordDb(DB_PATH) as db:
        if args.find:
            print(db.get(args.find))

        if args.popular:
            for item in db.most_popular(300):
                print(item)
            print(db.total_count())

        if args.index:
            db.reset()
            index(db)


if __name__ == "__main__":
    main()