;; Namespace declaration. Java classes are imported for Unicode normalization
;; (Normalizer) and stream handling; the aliases below are the ones used by the
;; rest of this file (`tc`, `ds`, `str`, `jt`, `stats`).
(ns data-analysis.book-sales-analysis.core-helpers-v2
  (:import [java.text Normalizer Normalizer$Form]
           [java.io ByteArrayInputStream ObjectInputStream])
  (:require [tablecloth.api :as tc]
            [tech.v3.dataset :as ds]
            [tablecloth.column.api :as tcc]
            [clojure.string :as str]
            [java-time.api :as jt]
            [fastmath.stats :as stats]))
;; Data Transformation Functions
;; Common data-processing functions used across multiple analysis files.
;; Scicloj Helpers
(defn merge-csvs
  "Loads every entry of `file-list` with `tc/dataset` (passing `options` to each
  call) and concatenates the resulting datasets into one.

  Returns an empty dataset when `file-list` is empty or nil, because
  `tc/concat` requires at least one dataset."
  [file-list options]
  (if (seq file-list)
    (->> file-list
         (mapv #(tc/dataset % options))
         (apply tc/concat))
    (tc/dataset)))
;; Column and Content Sanitizers
(defn sanitize-str
  "Sanitizes a string for use as a slug or identifier.

  Steps, in order: underscores become hyphens, surrounding whitespace is
  trimmed, diacritics are stripped (NFD decomposition followed by removal of
  combining marks), spaces become hyphens, parentheses are removed and the
  result is lower-cased. Intended for general-purpose text like book titles.

  Returns `s` unchanged when it is nil or empty."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (-> s
        (str/replace #"_" "-")
        str/trim
        ;; Decompose accented characters so the accents become separate
        ;; combining marks that the next step can delete.
        (Normalizer/normalize Normalizer$Form/NFD)
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "")
        (str/replace #" " "-")
        (str/replace #"\(|\)" "")
        str/lower-case)))
(defn sanitize-column-name-str
  "Sanitizes a string for use as a dataset column name.

  More aggressive than `sanitize-str`: it also removes the literal substring
  '(YYYY-MM)', converts slashes to hyphens and collapses runs of hyphens.

  Returns `s` unchanged when it is nil or empty.

  Note: hyphen runs are collapsed before parentheses are removed, so input
  such as \"a () b\" still yields a double hyphen. The step order is kept
  as-is so that previously generated column names stay stable."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (-> s
        (str/replace #"\(YYYY-MM\)" "")                       ; special removal
        str/trim
        str/lower-case
        (str/replace #"_" "-")                                ; underscores to hyphens
        (str/replace #" " "-")                                ; spaces to hyphens
        (str/replace #"\/" "-")                               ; slashes to hyphens
        (str/replace #"-{2,}" "-")                            ; multiple hyphens to one
        (Normalizer/normalize Normalizer$Form/NFD)            ; split accents from letters
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "")  ; drop the accents
        (str/replace #"\(|\)" ""))))                          ; drop parentheses
(defn sanitize-category-str
  "Sanitizes a string representing categories.

  Similar to the other sanitizers, but specifically handles comma-separated
  lists by removing the whitespace character after a comma
  (e.g. 'a, b' -> 'a,b'). Remaining whitespace becomes hyphens, slashes become
  hyphens, hyphen runs are collapsed, diacritics are stripped and parentheses
  are removed.

  Returns `s` unchanged when it is nil or empty."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (-> s
        str/trim
        str/lower-case
        (str/replace #"\,\s" ",")                             ; drop whitespace after a comma
        (str/replace #"\s" "-")                               ; whitespace to hyphens
        (str/replace #"\/" "-")                               ; slashes to hyphens
        (str/replace #"-{2,}" "-")                            ; multiple hyphens to one
        (Normalizer/normalize Normalizer$Form/NFD)            ; split accents from letters
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "")  ; drop the accents (temporarily)
        (str/replace #"\(|\)" ""))))                          ; drop parentheses
(defn parse-book-name
  "Normalizes a single book name given as a string.

  Removes commas and plus signs, trims, applies `sanitize-category-str`, and
  prefixes a leading '3' or '5' with 'k' (so the result never starts with
  those digits). Unlike `parse-books-from-list`, it takes just one string and
  returns a string, not a keyword.

  Returns nil when `s` is nil."
  [s]
  (when (some? s) ; str/replace would throw on nil
    (-> s
        (str/replace #"," "")
        (str/replace #"\+" "")
        str/trim
        sanitize-category-str
        (str/replace #"^3" "k3")
        (str/replace #"^5" "k5"))))
(defn parse-csv-date
  "Parses a date string from the CSV export into a local date pinned to the
  first day of its month.

  Two shapes are recognised, selected by the string length:
  - longer than 6 characters: characters 3-4 are read as the month number and
    characters 6-9 as the year (i.e. a `dd.MM.yyyy` layout);
  - otherwise: the string is split on '.', the first part is looked up among
    the abbreviated Czech month names and the second part is a year offset
    from 2000 (e.g. \"led.24\").

  On any parsing failure the function does not throw; it returns the string
  \"Chyba: \" followed by the original input."
  [date-str]
  (let [month-names ["led" "úno" "bře" "dub" "kvě" "čvn" "čvc" "srp" "zář" "říj" "lis" "pro"]
        parse-full-date (fn [s]
                          [(Integer/parseInt (subs s 6 10))
                           (Integer/parseInt (subs s 3 5))])
        parse-short-date (fn [s]
                           (let [[month-str year-str] (str/split s #"\.")]
                             [(+ 2000 (Integer/parseInt year-str))
                              ;; .indexOf gives -1 for an unknown name, i.e.
                              ;; month 0, which local-date rejects below.
                              (inc (.indexOf month-names month-str))]))]
    (try
      (let [[year month] (if (> (count date-str) 6)
                           (parse-full-date date-str)
                           (parse-short-date date-str))]
        (jt/local-date year month 1))
      (catch Exception _
        (str "Chyba: " date-str)))))
(defn parse-books-from-list
  "Parses book names from string `s` (entries separated by a comma,
  whitespace and a quantity digit) into a vector of cleaned, distinct
  keywords.

  Each entry has its quantity prefix ('2× '), commas, format markers
  (e.g. '(A+E)', 'e-kniha', 'audio', 'papír') and plus signs removed, is run
  through `sanitize-str`, loses anything after a double hyphen and any
  trailing hyphens, and gets a 'k' prefix when it starts with '3' or '5'.
  Entries that end up blank, or that contain one of the non-book markers
  (gift vouchers, bundles, merchandise, ...), are dropped.

  Returns nil when `s` is nil or empty."
  [s]
  (when (seq s)
    (let [non-book-markers ["balicek" "poukaz" "zapisnik" "limitovana-edice" "taska"
                            "aktualizovane-vydani" "cd" "puvodni-vydani/neni-skladem"
                            "merch"]
          clean (fn [item]
                  (-> item
                      (str/replace #"\d*×\s" "")
                      (str/replace #"," "")
                      (str/replace #"\(A\+E\)|\[|\]|komplet|a\+e|\s\(P\+E\+A\)|\s\(e\-kniha\)|\s\(P\+E\)|\s\(P\+A\)|\s\(E\+A\)|papír|papir|audio|e\-kniha|taška" "")
                      (str/replace #"\+" "")
                      str/trim
                      sanitize-str
                      (str/replace #"\-\-.+$" "")
                      (str/replace #"\-+$" "")
                      (str/replace #"^3" "k3")
                      (str/replace #"^5" "k5")))
          non-book? (fn [item]
                      (some #(str/includes? item %) non-book-markers))]
      (->> (str/split s #",\s\d+")
           (map clean)
           ;; An entry consisting only of removed markers (e.g. "taška")
           ;; cleans down to "", which would become an empty-named keyword.
           (remove str/blank?)
           (remove non-book?)
           distinct
           (mapv keyword)))))
;; Metadata Enriching and Convenience Functions
(def end-time
  "Reference end date (2025-10-01) used by `corr-matrix` as the `end-date`
  when computing how long each book has been on the market."
  (jt/local-date 2025 10 1))
(defn months-between
  "Calculate how many months a product has been on market.

  Measures the number of days from `start-date` to `end-date` and divides by
  30.4375 (the average month length in days), rounding to the nearest whole
  month. Returns 0 when either date is nil."
  [start-date end-date]
  (let [days (if (and start-date end-date)
               (jt/time-between start-date end-date :days)
               0)]
    (long (Math/round (/ days 30.4375)))))
(defn months-on-market
  "Months `book` is on a market. Zero if not at all.

  Looks up the first row of `books-ds` whose `:titul` starts with the name of
  `book`, reads its `:datum-zahajeni-prodeje` value and returns the number of
  months between that date and `end-date` (see `months-between`).

  Any exception raised during the lookup (for example when no row matches) is
  treated as \"not on the market\" and yields 0."
  [books-ds book end-date]
  (let [date (try
               (-> books-ds
                   (tc/select-columns [:titul :datum-zahajeni-prodeje])
                   (tc/select-rows #(str/starts-with? (name (:titul %)) (name book)))
                   (tc/get-entry :datum-zahajeni-prodeje 0))
               (catch Exception _ nil))]
    (if (nil? date)
      0
      (months-between date end-date))))
(defn czech-author?
  "Returns 1 when `book-title` (a keyword or string) is in the built-in set of
  books by Czech authors, otherwise 0.

  Titles whose name starts with \"book\" come from the fully anonymized
  dataset; for those a random flag (0 or 1) is returned, so the result is not
  deterministic for such inputs."
  [book-title]
  (let [czech-books #{:k30-hodin
                      :k365-anglickych-cool-fraz-a-vyrazov
                      :k365-anglickych-cool-frazi-a-vyrazu
                      :bulbem-zachranare
                      :hacknuta-cestina
                      :handmade-byznys
                      :hot
                      :hry-site-porno
                      :jak-na-site
                      :jak-sbalit-zenu-2.0
                      :konec-prokrastinace
                      :let-your-english-september
                      :myty-a-nadeje-digitalniho-sveta
                      :na-volne-noze
                      :napoleonuv-vojak
                      :nedelni-party-s-picassem
                      :restart-kreativity
                      :sport-je-bolest
                      :stat-se-investorem
                      :temne-pocatky-ceskych-dejin
                      :uc-jako-umelec
                      :velka-kniha-fuckupu
                      :zamilujte-se-do-anglictiny
                      :pretizeny
                      :od-chaosu-ke-smyslu
                      :very-hard-orechy
                      :heureka!}
        ;; `str` on a keyword keeps the leading colon (":book-1"), which would
        ;; never match the "book" prefix; `name` drops it.
        title (if (instance? clojure.lang.Named book-title)
                (name book-title)
                (str book-title))]
    (if (str/starts-with? title "book") ;; this is a part used to add flags of Czech books into fully anonymized dataset
      (rand-int 2)
      (if (contains? czech-books (keyword book-title)) 1 0))))
;; One-Hot Encoding Functions
(defn onehot-encode-by-customers ;; FIXME needs refactor and simplification :)
  "One-hot encode dataset aggregated by customer.

  Each customer gets one row with 0/1 values for each book they bought.
  Used for market basket analysis, customer segmentation, etc.

  `raw-ds` must provide the columns `:zakaznik` (customer) and
  `:produkt-produkty` (product list string). Rows where either is missing, or
  where the customer is blank, are ignored. The result has a `:zakaznik`
  column plus one column per distinct book keyword produced by
  `parse-books-from-list`."
  [raw-ds]
  (let [;; First, aggregate all purchases by customer: one row per customer
        ;; with every product string joined into `:all-products`.
        customer+orders (-> raw-ds
                            (ds/drop-missing :zakaznik)
                            (tc/drop-rows #(= "" (str/trim (:zakaznik %))))
                            (ds/drop-missing :produkt-produkty)
                            (tc/group-by [:zakaznik])
                            (tc/aggregate {:all-products #(str/join ", " (tc/column % :produkt-produkty))})
                            (tc/rename-columns {:summary :all-products}))
        ;; Get all unique books from all the lines
        all-titles (->> (tc/column customer+orders :all-products)
                        (mapcat parse-books-from-list)
                        distinct
                        sort)
        ;; For each customer create one aggregated row with all purchases in 0/1 format
        customer->row (fn [customer-row]
                        (let [books-bought-set (set (parse-books-from-list (:all-products customer-row)))
                              one-hot-map (reduce (fn [acc book]
                                                    (assoc acc book (if (contains? books-bought-set book) 1 0)))
                                                  {}
                                                  all-titles)]
                          (merge {:zakaznik (:zakaznik customer-row)}
                                 one-hot-map)))
        customers->rows (map customer->row (tc/rows customer+orders :as-maps))]
    ;; Create and return a new dataset from the one-hot rows
    (tc/dataset customers->rows)))
;; Statistical Functions for Apriori Analysis
(defn calculate-support
  "Calculate support for a given itemset in a one-hot-encoded dataset.

  Support = (rows containing itemset) / (total rows)

  `itemset` is a collection of column names; a row contains the itemset when
  every one of those columns holds a non-zero value. A missing value (or a
  column that does not exist) counts as \"not present\" rather than raising.
  Returns a double, 0.0 for a dataset without rows."
  [dataset itemset]
  (let [total-transactions (tc/row-count dataset)
        present? (fn [row item]
                   (let [v (get row item)]
                     ;; guard against nil: (zero? nil) would throw
                     (and (some? v) (not (zero? v)))))
        transactions-with-itemset (-> dataset
                                      (tc/select-rows (fn [row] (every? #(present? row %) itemset)))
                                      tc/row-count)]
    (if (zero? total-transactions)
      0.0
      (double (/ transactions-with-itemset total-transactions)))))
;; Visuals
(defn color-hex
  "Returns the colour `#c1ab55` with a two-digit hex alpha suffix that encodes
  `support`.

  `support` is mapped linearly from [`min-support`, `max-support`] onto the
  opacity range [20, 255] and clamped to that range. When the two bounds are
  numerically equal the midpoint opacity is used."
  [support min-support max-support]
  (let [min-opacity 20
        max-opacity 255
        ;; Map support from [min-support, max-support] to [min-opacity, max-opacity]
        opacity (if (== min-support max-support)
                  ;; Degenerate range. `==` (not `=`) so that mixed numeric
                  ;; types such as 1 and 1.0 are recognised as equal and never
                  ;; reach the division below.
                  (int (/ (+ min-opacity max-opacity) 2))
                  (int (+ min-opacity
                          (* (- max-opacity min-opacity)
                             (/ (- support min-support)
                                (- max-support min-support))))))
        ;; Ensure opacity stays within bounds
        clamped-opacity (-> opacity
                            (max min-opacity)
                            (min max-opacity))]
    (str "#c1ab55" (format "%02x" clamped-opacity))))
;; Correlation Functions
(defn corr-a-x-b
  "Creates a correlation matrix with book columns and the added :book column \n
  - `ds` is dataset; a `:zakaznik` column, if present, is dropped first \n
  Example: \n
  => _unnamed [2 3]: \n
  | :a          | :b          | :book |
  |------------:|------------:|-------|
  | 1.00000000  | -0.12121831 | :a    |
  | -0.12121831 | 1.00000000  | :b    |"
  [ds]
  (let [clean-ds (tc/drop-columns ds [:zakaznik])
        ;; Names must come from the cleaned dataset: taking them from `ds`
        ;; would include :zakaznik and shift every label by one against the
        ;; matrix computed from `clean-ds`.
        columns (tc/column-names clean-ds)]
    (-> (zipmap columns (stats/correlation-matrix (tc/columns clean-ds)))
        tc/dataset
        (tc/add-column :book columns))))
(defn corr-matrix
  "Creates a correlation matrix with books sorted by publication date (chronological order) \n
  `books-onehot` – one-hot encoded dataset \n
  `books-meta` – dataset passed to `months-on-market` to look up how long each
  book has been on the market (measured up to `end-time`)

  Both the columns and the rows of the result are ordered by ascending
  months-on-market."
  [books-onehot books-meta]
  (let [;; Each lookup scans `books-meta`; cache it, since the same book is
        ;; queried repeatedly while sorting and again for the row order.
        market-age (memoize #(months-on-market books-meta % end-time))
        ordered-columns (sort-by market-age (tc/column-names books-onehot))]
    (-> books-onehot
        (tc/reorder-columns ordered-columns)
        (tc/drop-columns [:zakaznik])
        corr-a-x-b
        (tc/reorder-columns ordered-columns)
        ;; temporary column used only to order the rows like the columns
        (tc/add-column :sort-col (fn [ds] (map market-age (tc/column ds :book))))
        (tc/order-by :sort-col)
        (tc/drop-columns :sort-col))))
;; Export helper functions from other namespaces for convenience
;; `sanitize-str`, `merge-csvs`, `parse-books-from-list`,
;; `sanitize-column-name-str` and `parse-csv-date` are defined in this
;; namespace and are therefore already public here. They are intentionally not
;; re-bound with `(def f f)`: such a self-redefinition keeps the value but
;; replaces the var's metadata, discarding the docstring and arglists.

;; Load-time confirmation message.
(println "Core helpers loaded.")
;; OUT
;; Core helpers loaded.
;; => nil