import pandas as pd #untuk membersihkan, memanipulasi, dan menganalisis data
import faiss #untuk pencarian cepat terhadap vektor berdimensi tinggi (Facebook AI Similarity Search)
import numpy as np #untuk komputasi numerik dan analisis data

# Load the US e-commerce order records; the CSV is decoded as Windows-1252 (cp1252).
data = pd.read_csv("data/US-E-commerce-records-2020.csv", encoding='cp1252')
# Preview the first rows of the loaded table.
data.head()

Nama: Rina | Gaji: 10jt | Divisi: Finance

def penggabungan_kolom(df, kolom_data):
    """Add a 'teks' column that joins the selected columns into one string per row.

    Args:
        df: DataFrame to extend. It is modified in place.
        kolom_data: List of column names whose values are combined, in order.

    Returns:
        The same DataFrame object, now carrying the 'teks' column. Each value
        is the row's selected cells converted to strings and joined with ' | '.
    """
    separator = ' | '
    # Every cell must be a string before it can be joined.
    as_text = df[kolom_data].astype('str')
    df['teks'] = as_text.agg(separator.join, axis=1)
    return df

# Build the combined-text column 'teks' from every original column of the table.
penggabungan_kolom(
    df=data,
    kolom_data=[
        'Order Date', 'Row ID', 'Order ID', 'Ship Mode', 'Customer ID',
        'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region',
        'Product ID', 'Category', 'Sub-Category', 'Product Name', 'Sales',
        'Quantity', 'Discount', 'Profit',
    ],
)

# Put the combined-text column first and keep the remaining columns in
# their existing order.
column_to_move = 'teks'
cols = [column_to_move] + [name for name in data.columns if name != column_to_move]
data = data.reindex(columns=cols)

data.head()

# Inspect the combined text of the first row.
data.teks[0]

'01-01-20 | 849 | CA-2017-107503 | Standard Class | GA-14725 | Consumer | United States | Lorain | Ohio | 44052 | East | FUR-FU-10003878 | Furniture | Furnishings | Linden 10" Round Wall Clock, Black | 48.896 | 4 | 0.2 | 8.5568'

from sentence_transformers import SentenceTransformer, util

# Load the pretrained 'paraphrase-MiniLM-L6-v2' sentence-embedding model; it is the
# `model` used by every encode() call in the rest of this file.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

/opt/anaconda3/envs/dss_rag_llm/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
/opt/anaconda3/envs/dss_rag_llm/lib/python3.10/site-packages/torch/backends/mps/__init__.py:22: UserWarning: Skipping device NVIDIA GeForce GT 650M that does not support Metal 2.0 (Triggered internally at /Users/runner/work/_temp/anaconda/conda-bld/pytorch_1711403210267/work/aten/src/ATen/mps/MPSDevice.mm:101.)
  return torch._C._mps_is_available()

# Prepare the search query and compute its embedding as a NumPy array
query =  "office supplies"
embedding_query_baru = model.encode(query, convert_to_numpy=True)
embedding_query_baru

array([-0.716872  ,  0.45632547, -0.63171613,  0.17367159,  0.5328752 ,
       -0.09007177, -0.07108808, -0.16285414,  0.25690147,  0.17362742,
       -0.6282766 ,  0.7277662 ,  0.10182606, -0.55017585, -0.45448703,
       -0.68073285, -0.5745402 ,  0.5512858 , -0.5948805 ,  0.39587298,
        0.6755898 ,  0.13175751, -0.22515845, -0.24932042,  0.3801938 ,
        0.6198754 , -0.59237254, -0.31570452,  0.11191514, -0.86276555,
        0.30706227, -0.31802088,  0.7374637 ,  0.38981938,  0.45587975,
        0.0024129 ,  0.2376375 , -0.21247113,  0.3418254 , -0.39475846,
       -0.4128146 , -0.1768777 , -0.37005985,  0.6327628 ,  0.15006693,
       -0.0750848 , -0.25734234, -0.2713463 , -0.01010478,  0.6614528 ,
       -0.06255485, -0.03240326, -0.62716305,  0.45598036, -0.5189893 ,
        0.07669385,  0.44694984,  0.04969588, -0.18693882, -0.22442754,
        0.3027749 , -0.28193465, -0.7240948 ,  0.5458953 , -0.02059112,
        0.41022623, -0.30816698,  0.31400004, -1.0579426 , -0.27516127,
       -0.88060695, -0.09642053, -0.13878804, -0.1225189 ,  0.8096336 ,
       -0.47774458, -0.45532733, -0.26063493,  0.0688246 ,  0.5531689 ,
       -0.20091069,  0.50306493,  0.24141857,  1.5878574 , -0.01096543,
        0.27468482,  0.37082553,  0.14636551,  0.31162113, -0.5302933 ,
        0.61639094,  0.08248893,  0.54348993, -0.2554055 ,  0.13642766,
        0.38311273,  0.468063  ,  0.35530812,  0.17560157,  1.0113138 ,
       -0.7176941 , -0.09537454,  0.5049549 , -0.93902993, -0.23732364,
       -0.7826657 ,  0.2403502 , -0.22476307, -0.6803273 , -0.42944977,
       -0.08635149,  0.04775952, -1.0752567 ,  0.14966644,  0.26317772,
       -0.33440098, -0.45266908, -0.25068796, -0.84692335, -0.04420531,
        0.26168993,  0.40996048, -0.47376207, -0.19961348, -0.42054746,
       -0.41274512,  0.47495472,  0.48490545, -0.34309387,  0.1997122 ,
        0.09500191,  0.7387729 ,  0.5688182 ,  0.05905704,  0.0619497 ,
        0.02418515,  0.08326092,  0.37355143,  0.18075053,  1.196794  ,
       -0.0966131 ,  1.3010595 ,  0.07952936,  0.1090833 ,  0.40113896,
        0.7450007 , -0.66891634,  0.3075547 ,  0.29246238,  0.05144548,
        0.21325427, -0.07839283,  0.3206023 ,  0.65356326, -0.16836482,
       -0.03705693, -0.11945157,  0.11124043, -0.10847221, -0.33006153,
       -0.25534928, -0.16391616, -0.22077248,  0.26823807, -0.25441164,
       -0.47099403,  0.67622685, -0.19818729,  0.34653196,  0.56134045,
        0.96493804, -0.17697784,  0.41002962,  0.19220953,  0.35834572,
        0.6034603 ,  0.7987442 ,  0.3730126 , -0.23874009,  0.18682435,
       -0.0600431 ,  0.0529494 , -0.02068056, -0.5020242 , -0.04756421,
        0.2056844 ,  0.5267801 , -0.25270036, -0.4280783 ,  1.0895662 ,
        0.04784279,  0.2098447 ,  0.2877956 ,  0.2227553 , -0.4405536 ,
       -0.253803  ,  0.3634785 , -0.68804485, -0.6655482 , -0.44891033,
        0.22508246, -0.0760952 , -0.34520218,  0.27943   ,  0.5547304 ,
        0.05405214, -0.86148787, -0.5271838 , -0.6231332 ,  0.11703995,
        0.08456165, -0.9168004 , -0.13979214,  0.6862396 ,  0.2635855 ,
        0.46252382,  0.09943815,  0.14254077, -0.22160771, -0.14332591,
       -0.6255703 , -0.05509243,  0.10144128, -0.54967535,  0.3399599 ,
       -0.6828388 , -0.47692907,  0.10588402,  0.36131302,  0.32136747,
        0.30153745, -0.7830902 ,  0.05726122,  0.16556762, -0.4088861 ,
       -0.2895187 ,  0.07728919, -0.29874155, -0.15275204,  0.6085232 ,
       -0.07868923,  0.08964686, -0.38627154, -0.14655973, -0.2746126 ,
        0.36797458,  0.97895706,  0.08421431,  0.62297696,  0.14714462,
       -1.0969033 , -0.6118962 ,  0.30680338,  0.6172208 , -0.31196523,
       -0.70980257, -0.04590397,  0.17044333,  0.08085597, -0.61712575,
        0.49579048, -0.53421164,  0.13844502,  0.21207434,  0.9036713 ,
        0.01203351,  0.45673686,  0.34121707, -0.11050098, -0.4913195 ,
       -0.7303134 , -0.65920424, -0.02211818,  0.02996711,  0.3902007 ,
       -0.09468953, -0.3763703 , -0.34241357,  0.0221408 ,  0.7599114 ,
        0.01285066, -0.8379364 , -0.2632665 ,  0.14773248, -0.17956498,
        0.00189211, -0.03859607, -0.5122322 ,  0.14051217,  0.11144556,
        0.40959734, -0.15623197, -0.15883264, -0.07600958,  0.19547437,
       -0.22216107,  0.52617407, -0.43367544, -0.14048727,  0.37092495,
       -0.21578501, -0.27290767, -0.33180273, -0.02906696, -0.06637581,
       -0.12433513,  0.22082832,  0.3663525 , -0.09700422, -0.03339585,
        0.5353457 ,  0.02603964, -0.3363085 , -0.06546272, -0.39320305,
        0.29911685,  0.7403349 ,  0.94562316, -0.00529283, -0.14016496,
       -0.15291035,  0.43072325,  0.32022583,  0.11318783, -0.67600673,
       -0.8792449 , -0.42573196,  0.48362035, -0.04826087, -0.24121065,
        0.23637514, -0.88038015,  0.01524752,  0.16370122,  0.30416876,
       -0.07886551,  0.23292688, -0.07943282, -0.50966555, -0.6178465 ,
        0.42228332, -0.06621126,  0.20476288, -0.01969434,  0.23240395,
        0.2836976 , -0.08667324,  0.71448624, -0.20879152, -0.09402305,
        0.4570567 , -0.08434385,  0.21833211, -0.0117718 ,  0.45361462,
       -0.24470225,  0.3679428 ,  0.11360676, -0.4544754 ,  0.10264897,
        0.23915434, -0.86537325, -0.85225165,  0.37619138, -0.21463501,
       -0.20092209,  0.15147522, -0.2910621 , -0.3154092 ,  0.01118582,
        0.48642778, -0.22938201,  0.2643312 ,  0.61352175, -0.0556865 ,
       -0.1106727 ,  0.25453347, -0.13015775,  0.16870694,  0.4085933 ,
       -0.4422584 ,  0.08656847,  0.5610304 ,  0.01503886], dtype=float32)

# Embed the combined text of every row as a NumPy array
embedding_output = model.encode(data['teks'], convert_to_numpy= True)

# Compute the cosine similarity between the query embedding and every row embedding
cosine_scores = util.cos_sim(embedding_query_baru, embedding_output)
cosine_scores

tensor([[0.0433, 0.1144, 0.2621,  ..., 0.1599, 0.2248, 0.2101]])

# Same cosine-similarity computation as the preceding cell, run again unchanged.
cosine_scores = util.cos_sim(embedding_query_baru, embedding_output)
cosine_scores

tensor([[0.0433, 0.1144, 0.2621,  ..., 0.1599, 0.2248, 0.2101]])

data['teks'][2]

'01-01-20 | 6683 | CA-2017-154466 | First Class | DP-13390 | Home Office | United States | Franklin | Wisconsin | 53132 | Central | OFF-BI-10002012 | Office Supplies | Binders | Wilson Jones Easy Flow II Sheet Lifters | 3.6 | 2 | 0.0 | 1.728'

# Scale every row embedding to unit L2 norm and store the result as float32
embedding_dataframe = (
    embedding_output / np.linalg.norm(embedding_output, axis=1, keepdims=True)
).astype('float32')

# Embedding width: the number of columns of the normalised matrix
dimension = embedding_dataframe.shape[1]
dimension

384

# L2 index over the unit-normalised row embeddings.
index = faiss.IndexFlatL2(dimension)
index.add(embedding_dataframe)

# Encode the query as a one-row batch and scale it to unit length, matching
# the normalisation applied to the indexed vectors. With an un-normalised
# query the reported distances are dominated by the query's own norm
# (values around 66) instead of lying in the 0..4 range that holds between
# unit vectors. The order of the returned neighbours is not affected.
embedding_index_query = model.encode([query], convert_to_numpy=True)
embedding_index_query = embedding_index_query / np.linalg.norm(
    embedding_index_query, axis=1, keepdims=True
)
embedding_index_query = embedding_index_query.astype('float32')

# D holds the distances to the 5 nearest rows (for unit vectors the squared
# L2 distance equals 2 - 2 * cosine similarity); I holds their row positions.
D, I = index.search(embedding_index_query, k = 5)

D

array([[66.2     , 66.301384, 66.56446 , 66.59857 , 66.62447 ]],
      dtype=float32)

I

array([[ 462,  419, 3127, 2705,  189]])

# Wrap the embedding, normalisation and indexing steps in a function
def build_faiss_index_cosine(teks):
    """Embed the given texts, unit-normalise them and load them into a FAISS index.

    The texts are encoded with the module-level ``model``; each resulting row
    is divided by its L2 norm and cast to float32 before being added to a
    ``faiss.IndexFlatL2``.

    Despite the name, the index measures L2 distance rather than inner
    product. For unit-length vectors the squared L2 distance equals
    2 - 2 * cosine similarity, so the nearest neighbours by L2 are also the
    most cosine-similar ones. Queries should be normalised the same way if
    the returned distances are to be read on that scale.

    Args:
        teks: Collection of texts to embed. The encoder output is treated as
            a 2-D array with one row per text.

    Returns:
        A tuple ``(index, embedding)``: the populated FAISS index and the
        normalised float32 embedding matrix that was added to it.
    """
    # Embed the texts.
    vectors = model.encode(teks, convert_to_numpy=True)

    # Unit-normalise each row.
    row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    unit_vectors = (vectors / row_norms).astype('float32')

    # Build the index, sized to the embedding width.
    faiss_index = faiss.IndexFlatL2(unit_vectors.shape[1])
    faiss_index.add(unit_vectors)

    return faiss_index, unit_vectors

build_faiss_index_cosine(data['teks'])

(<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x248375e60> >,
 array([[-0.07242865, -0.08835787,  0.13058807, ..., -0.03424174,
         -0.05709865,  0.04164357],
        [ 0.02854108, -0.07424114,  0.04607036, ..., -0.04158579,
         -0.03203242,  0.02477697],
        [-0.16480534, -0.07522708,  0.07062422, ..., -0.01027824,
         -0.02206857,  0.00524082],
        ...,
        [-0.05696863, -0.05658817,  0.06185489, ..., -0.04932694,
         -0.05349618,  0.055727  ],
        [-0.07126582, -0.089638  ,  0.04064946, ..., -0.01434754,
         -0.0163991 ,  0.00132702],
        [-0.01710184, -0.10491675,  0.05089856, ..., -0.09893939,
         -0.03608089,  0.03270049]], dtype=float32))

data['teks'][419]

'21-03-20 | 9365 | CA-2017-111591 | Standard Class | PS-18970 | Home Office | United States | Seattle | Washington | 98105 | West | OFF-ST-10001809 | Office Supplies | Storage | Fellowes Officeware Wire Shelving | 359.32 | 4 | 0.0 | 7.1864'

def generate_answer(query, context, api_key):
    """Ask the chat model to answer `query` using `context` as supporting data.

    Assigns `api_key` to ``openai.api_key`` (a module-wide setting), sends one
    system message plus one user message containing the question and the
    context, and returns the text content of the first choice in the response.

    NOTE(review): `openai` is not imported in the visible part of this file,
    and ``openai.ChatCompletion`` looks like the pre-1.0 SDK interface;
    confirm the import and the installed SDK version.

    Args:
        query: The question to put to the model.
        context: Text inserted into the prompt as the relevant data.
        api_key: OpenAI API key used for the request.

    Returns:
        The content of the first returned message.
    """
    openai.api_key = api_key

    instructions = "Kamu adalah asisten cerdas yang menjawab pertanyaan berdasarkan data yang diberikan."
    prompt = f"""
    Pertanyaan: {query}

    Data yang relevan:
    {context}
    """
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": prompt},
    ]

    response = openai.ChatCompletion.create(
        model="gpt-4.1-mini",
        messages=conversation,
        temperature=0.3,
        max_tokens=1000,
    )
    first_choice = response.choices[0]
    return first_choice.message["content"]

import os

# The API key is read from the OPENAI_API_KEY environment variable instead of
# being written into the source: a key stored in code ends up in version
# control and in every shared copy of the notebook. The key that was
# previously hard-coded here must be treated as leaked and revoked.
# A missing variable raises KeyError rather than sending an empty key.
generate_answer(
    query='Dimanakah puncak gunung tertinggi di pulau Jawa berada?',
    context=""" There are at least 108 mountains on Earth with elevations of 7,200 m (23,622 ft; 4 mi) or greater above sea level. Of these, 14 are more than 8,000 m (26,247 ft; 5 mi).[1] The vast majority of these mountains are part of either the Himalayas or the Karakoram mountain ranges located on the edge of the Indian Plate and Eurasian Plate in China, India, Nepal, and Pakistan.""",
    api_key=os.environ['OPENAI_API_KEY'],
)

'Data yang diberikan tidak secara langsung menyebutkan puncak gunung tertinggi di pulau Jawa. Namun, berdasarkan pengetahuan umum, puncak gunung tertinggi di pulau Jawa adalah Gunung Semeru, yang terletak di Jawa Timur. Gunung Semeru memiliki ketinggian sekitar 3.676 meter di atas permukaan laut. Jadi, puncak gunung tertinggi di pulau Jawa berada di Gunung Semeru, Jawa Timur.'

Building LLM Applications for Structured Data Insights

Large Language Model & Retrieval Augmented Generation

Tahapan Dalam Membuat RAG

Step 1: Mempersiapkan Data Tabular

Step 2: Transformasi Data ke Format Teks

Step 3: Mencari Kesamaan Teks

Step 3.1: Embedding Teks

Step 3.2: FAISS Indexing Dari Pertanyaan

Mempersiapkan Generative AI yang terkoneksi dengan RAG

	Order Date	Row ID	Order ID	Ship Mode	Customer ID	Segment	Country	City	State	Postal Code	Region	Product ID	Category	Sub-Category	Product Name	Sales	Quantity	Discount	Profit
0	01-01-20	849	CA-2017-107503	Standard Class	GA-14725	Consumer	United States	Lorain	Ohio	44052	East	FUR-FU-10003878	Furniture	Furnishings	Linden 10" Round Wall Clock, Black	48.896	4	0.2	8.5568
1	01-01-20	4010	CA-2017-144463	Standard Class	SC-20725	Consumer	United States	Los Angeles	California	90036	West	FUR-FU-10001215	Furniture	Furnishings	Howard Miller 11-1/2" Diameter Brentwood Wall ...	474.430	11	0.0	199.2606
2	01-01-20	6683	CA-2017-154466	First Class	DP-13390	Home Office	United States	Franklin	Wisconsin	53132	Central	OFF-BI-10002012	Office Supplies	Binders	Wilson Jones Easy Flow II Sheet Lifters	3.600	2	0.0	1.7280
3	01-01-20	8070	CA-2017-151750	Standard Class	JM-15250	Consumer	United States	Huntsville	Texas	77340	Central	OFF-ST-10002743	Office Supplies	Storage	SAFCO Boltless Steel Shelving	454.560	5	0.2	-107.9580
4	01-01-20	8071	CA-2017-151750	Standard Class	JM-15250	Consumer	United States	Huntsville	Texas	77340	Central	FUR-FU-10002116	Furniture	Furnishings	Tenex Carpeted, Granite-Look or Clear Contempo...	141.420	5	0.6	-187.3815

	teks	Order Date	Row ID	Order ID	Ship Mode	Customer ID	Segment	Country	City	State	Postal Code	Region	Product ID	Category	Sub-Category	Product Name	Sales	Quantity	Discount	Profit
0	01-01-20 \| 849 \| CA-2017-107503 \| Standard Cla...	01-01-20	849	CA-2017-107503	Standard Class	GA-14725	Consumer	United States	Lorain	Ohio	44052	East	FUR-FU-10003878	Furniture	Furnishings	Linden 10" Round Wall Clock, Black	48.896	4	0.2	8.5568
1	01-01-20 \| 4010 \| CA-2017-144463 \| Standard Cl...	01-01-20	4010	CA-2017-144463	Standard Class	SC-20725	Consumer	United States	Los Angeles	California	90036	West	FUR-FU-10001215	Furniture	Furnishings	Howard Miller 11-1/2" Diameter Brentwood Wall ...	474.430	11	0.0	199.2606
2	01-01-20 \| 6683 \| CA-2017-154466 \| First Class...	01-01-20	6683	CA-2017-154466	First Class	DP-13390	Home Office	United States	Franklin	Wisconsin	53132	Central	OFF-BI-10002012	Office Supplies	Binders	Wilson Jones Easy Flow II Sheet Lifters	3.600	2	0.0	1.7280
3	01-01-20 \| 8070 \| CA-2017-151750 \| Standard Cl...	01-01-20	8070	CA-2017-151750	Standard Class	JM-15250	Consumer	United States	Huntsville	Texas	77340	Central	OFF-ST-10002743	Office Supplies	Storage	SAFCO Boltless Steel Shelving	454.560	5	0.2	-107.9580
4	01-01-20 \| 8071 \| CA-2017-151750 \| Standard Cl...	01-01-20	8071	CA-2017-151750	Standard Class	JM-15250	Consumer	United States	Huntsville	Texas	77340	Central	FUR-FU-10002116	Furniture	Furnishings	Tenex Carpeted, Granite-Look or Clear Contempo...	141.420	5	0.6	-187.3815