🤖 Integrating with ML/DL

This tutorial provides an example of processing raw data for AI models:

1. Loading Data from Database

2. Generating Tracks

   - Removing Anchored Pings

3. Interpolating Tracks

4. Saving into CSV

5. Visualization CSV on QGIS

6. Applying Machine Learning to Cluster Tracks

7. Developing a Deep Learning Model in Keras

8. Developing a Deep Learning Model in Pytorch

1. Loading Data from Database

from datetime import datetime, timedelta

# open a connection to an existing AISDB SQLite database file
# (DBConn is provided by the aisdb package; the path is an example)
dbconn = DBConn(dbpath="/home/sqlite_database_file.db") 

# build a query that extracts AIS rows within the given time range;
# `in_timerange` is the aisdb callback that applies the start/end filter
qry = aisdb.DBQuery(dbconn=dbconn, callback=in_timerange,
                      start = datetime(2015, 8, 1, hour=0),
                      end=datetime(2015, 8, 3, hour=2)
                      )
rowgen_ = qry.gen_qry(verbose=True) # lazily yields the matching rows

If you want vessel metadata, re-aggregate the static messages when generating the query:

# reaggregate_static=True attaches vessel metadata (static AIS messages) to the rows
rowgen_ = qry.gen_qry(reaggregate_static=True, verbose=True)

2. Generating Tracks

# decode the row stream into per-vessel track generators;
# decimate=False keeps every ping instead of simplifying the geometry
tracks_ = aisdb.track_gen.TrackGen(rowgen_, decimate=False)

Removing Anchored Pings

# A common way to drop anchored (stationary) pings is to remove
# positions whose speed is near zero; 0.1 is the threshold used
# here -- tune it for your data.
tracks_ = aisdb.remove_pings_wrt_speed(tracks_, 0.1)

Some approaches remove pings that are close to shore. An example of calculating that distance is provided in the Distance from Shore section.

3. Interpolating Tracks

# Splitting tracks into voyages wherever consecutive pings exceed the
# distance or speed thresholds; minscore controls how aggressively
# candidate segments are linked together.
tracks_ = aisdb.encode_greatcircledistance(tracks_,
                                                  distance_threshold=50000,
                                                  minscore=1e-5,
                                                  speed_threshold=50)
# resample each voyage onto a regular 5-minute time grid
tracks_ = aisdb.interp_time(tracks_, step=timedelta(minutes=5))

4. Saving into CSV

# export the interpolated tracks to CSV; later sections read this file
# back and expect a Track_ID column identifying each voyage
aisdb.write_csv(tracks_,"/home/Export_.csv")

5. Visualization CSV on QGIS

QGIS is a cross-platform desktop geographic information system application that supports viewing, editing, printing, and analysis of geospatial data.

The CSV can be imported via Menu > Layer > Add Layer > Add Delimited Text Layer.

The tracks can be generated with the Points to Path processing tool in QGIS, using Track_ID as the grouping parameter.

6. Applying Machine Learning to Cluster Tracks

import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

def slope_distance(trj1, trj2):
    """Signed difference between the regression slopes of two trajectories."""
    return calculate_regression_slope(trj1) - calculate_regression_slope(trj2)
    
def calculate_regression_slope(trajectory):
    """Fit a first-degree polynomial y ~ x over the trajectory points and
    return the fitted slope (intercept is discarded)."""
    xs = [point[0] for point in trajectory]
    ys = [point[1] for point in trajectory]
    slope, _intercept = np.polyfit(xs, ys, 1)
    return slope
    
def cluster_trajectories_in_df(df, eps=0.5, min_samples=1, cluster_col_name='cluster', parts=10, threshold=0.8):
    """Cluster the trajectories in *df* with DBSCAN and label each row.

    Each trajectory is identified by the 'Track_ID' column and reduced to one
    feature: the slope of a linear regression over its (lon, lat) points.
    The pairwise distance between two trajectories is the absolute difference
    of their slopes, and DBSCAN clusters on that precomputed matrix.

    Parameters:
        df: DataFrame with 'Track_ID', 'lon' and 'lat' columns.
        eps, min_samples: DBSCAN parameters.  Note min_samples=1 means no
            trajectory is ever labelled as noise (-1).
        cluster_col_name: name of the label column added to *df*.
        parts, threshold: unused; kept for interface compatibility.

    Returns the same dataframe with the cluster-label column added in place.
    """
    # Get unique trajectory identifiers and their point sequences
    unique_ids = df['Track_ID'].unique()
    trajectories = [df[df['Track_ID'] == uid][['lon', 'lat']].values for uid in unique_ids]

    def slope_distance_matrix(trajs):
        # Compute each slope ONCE (the original recomputed slopes for every
        # pair, O(n^2) regressions) and take pairwise absolute differences.
        slopes = np.array([calculate_regression_slope(t) for t in trajs])
        return np.abs(slopes[:, None] - slopes[None, :])

    dist_matrix = slope_distance_matrix(trajectories)

    # Perform DBSCAN clustering on the precomputed distances
    clusterer = DBSCAN(metric='precomputed', eps=eps, min_samples=min_samples)
    clusters = clusterer.fit_predict(dist_matrix)

    # Map each trajectory identifier to its cluster label and annotate rows
    id_to_cluster = dict(zip(unique_ids, clusters))
    df[cluster_col_name] = df['Track_ID'].map(id_to_cluster)
    return df
# load the exported tracks and attach a cluster label per trajectory
df__ = pd.read_csv("/home/Export_.csv") 
df2_ = cluster_trajectories_in_df(df__) # adjust the DBSCAN parameters accordingly
df2_.to_csv("/home/grouped_tracks.csv", index=False)

7. Developing a Deep Learning Model in Keras

Reading the CSV and transforming the data depend on the type of task we want to perform on the tracks. Here, we provide an example of using the CSV in a sequence-to-sequence model that predicts the next 3 points as output while giving the model 10 AIS messages as input.

import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import io

# read the exported tracks into a dataframe
df__ = pd.read_csv("/home/Export_.csv") 

# Define the input sequence length and output sequence length
input_sequence_length = 10   # number of AIS messages fed to the model
output_sequence_length = 3   # number of future points to predict
output_feature_length = 2    # predicted features: lon, lat

# Group data by Track_ID and convert each track into a sequence
sequences = []
output_sequences = []
feature_set = ['lon', 'lat', 'cog', 'heading', 'sog']
# each track is identified by 'Track_ID', so windows are sliced per track
# (fix: the dataframe is df__; the original referenced an undefined `data`)
track_ids = df__['Track_ID'].unique()

# For each track, slice based on the defined input/output window.
for track_id in track_ids:
    track_data = df__[df__['Track_ID'] == track_id][feature_set].values
    num_points = len(track_data)  # total points of this track

    # sliding window: 10 input messages followed by 3 target points
    for i in range(0, num_points - input_sequence_length - output_sequence_length + 1):
        input_seq = track_data[i:i + input_sequence_length]
        output_seq = track_data[i + input_sequence_length:i + input_sequence_length + output_sequence_length]

        sequences.append(input_seq)
        output_sequences.append(output_seq)

# Convert sequences and output_sequences to numpy arrays
sequences = np.array(sequences)
output_sequences = np.array(output_sequences)

print(sequences.shape)  # (total samples, input_sequence_length, features)
print(output_sequences.shape)


# Standardize the data: flatten to (samples*steps, features), fit+scale,
# then reshape back to the original 3-D windows
scaler = StandardScaler()
sequences = sequences.reshape(-1, len(feature_set))

sequences = scaler.fit_transform(sequences)
# Reshape sequences back to the original shape
sequences = sequences.reshape(-1, input_sequence_length, len(feature_set))

# normalize the output sequences with the same (already fitted) scaler
output_sequences = output_sequences.reshape(-1, len(feature_set))
output_sequences = scaler.transform(output_sequences)
output_sequences = output_sequences.reshape(-1, output_sequence_length, len(feature_set))

output_sequences = output_sequences[:, :, 0:2]  # keep only lon and lat as targets
print(output_sequences.shape)


# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(sequences, output_sequences, test_size=0.2, random_state=42)

# Define the sequence-to-sequence model: LSTM encoder, repeated context
# vector, LSTM decoder, and a per-step Dense head producing (lon, lat)
model = Sequential()
model.add(LSTM(100, activation='relu', input_shape=(input_sequence_length, len(feature_set))))
model.add(RepeatVector(output_sequence_length))
model.add(LSTM(100, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(2)))  # 2 features (lon, lat) in the output
model.compile(optimizer='adam', loss='mse')  # You can choose a different loss function if needed

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_val, y_val))

8. Developing a Deep Learning Model in Pytorch

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

# Step 1: Load CSV
def load_csv(file_path):
    """Read the CSV at *file_path* into a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Step 2: Generate sequences from data
def generate_sequences(data, input_seq_len, output_seq_len, input_features):
    """Slice every track into (input, output) windows of consecutive rows.

    Returns a list of (input_seq, output_seq) array pairs: the input covers
    *input_seq_len* rows and the output the following *output_seq_len* rows
    of the selected feature columns, per Track_ID.
    """
    pairs = []
    window = input_seq_len + output_seq_len
    for tid in data['Track_ID'].unique():
        track = data[data['Track_ID'] == tid]
        for start in range(len(track) - window + 1):
            split = start + input_seq_len
            past = track[start:split][input_features].values
            future = track[split:start + window][input_features].values
            pairs.append((past, future))
    return pairs

# Step 3: Normalize the data
def normalize_data(sequences, output_features_index, scaler):
    """Scale every (input, output) pair with the already-fitted *scaler*.

    After scaling, each output sequence is reduced to the columns listed in
    *output_features_index*.  Returns the normalized pairs together with the
    scaler (unchanged) for later inverse transforms.
    """
    normalized = [
        (scaler.transform(past), scaler.transform(future)[:, output_features_index])
        for past, future in sequences
    ]
    return normalized, scaler

# Step 4: Split the data into training, test, validation
def split_data(sequences):
    """Split into 60% train / 20% validation / 20% test with fixed seeds."""
    remainder, test_seq = train_test_split(sequences, test_size=0.2, random_state=42)
    # 0.25 of the remaining 80% leaves 20% of the total for validation
    train_seq, val_seq = train_test_split(remainder, test_size=0.25, random_state=42)
    return train_seq, val_seq, test_seq

# Step 5: Model initialization
class LSTMModel(nn.Module):
    """Seq2seq predictor that re-reads the full input through an LSTM once
    per output step, carrying the hidden/cell state across steps, and
    projects the final timestep of each pass to the output features."""

    def __init__(self, input_size, hidden_size, output_size, num_layers, output_sequence_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.output_sequence_size = output_sequence_size
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, x):
        """Return predictions shaped (batch, output_sequence_size, output_size)."""
        batch = x.size(0)
        state_shape = (self.num_layers, batch, self.hidden_size)
        # zero-initialized hidden and cell state, carried across decode steps
        state = (torch.zeros(*state_shape, device=x.device),
                 torch.zeros(*state_shape, device=x.device))

        steps = []
        for _ in range(self.output_sequence_size):
            lstm_out, state = self.lstm(x, state)
            # project only the last timestep of this pass
            steps.append(self.linear(lstm_out[:, -1, :]))

        # stack per-step predictions along the sequence dimension
        return torch.stack(steps, dim=1)

# Function to create dataloaders
def create_dataloaders(sequences, batch_size):
    """Wrap (input, output) pairs in a shuffling DataLoader of float32 tensors."""
    input_list = [pair[0] for pair in sequences]
    target_list = [pair[1] for pair in sequences]
    inputs = torch.tensor(input_list, dtype=torch.float32)
    targets = torch.tensor(target_list, dtype=torch.float32)
    dataset = TensorDataset(inputs, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Step 6: Training the model in batches
def train_model(model, dataloaders, epochs, learning_rate):
    """Train with Adam + MSE on dataloaders['train'], printing the last
    training-batch loss and the mean validation loss after every epoch."""
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    for epoch in range(epochs):
        # --- training pass ---
        model.train()
        for batch_inputs, batch_targets in dataloaders['train']:
            optimizer.zero_grad()
            loss = criterion(model(batch_inputs), batch_targets)
            loss.backward()
            optimizer.step()

        # --- validation pass (no gradients) ---
        model.eval()
        with torch.no_grad():
            val_loss = 0
            for batch_inputs, batch_targets in dataloaders['val']:
                val_loss += criterion(model(batch_inputs), batch_targets).item()
            print(f'Epoch {epoch+1}/{epochs}, Training Loss: {loss.item():.4f}, Validation Loss: {val_loss/len(dataloaders["val"]):.4f}')

# Step 7: Get prediction from model
def predict(model, dataloader):
    """Run *model* over every batch (inputs only, no gradients) and return
    the concatenated predictions as a single numpy array."""
    model.eval()
    batches = []
    with torch.no_grad():
        for batch_inputs, _ in dataloader:
            batches.append(model(batch_inputs).numpy())
    return np.concatenate(batches)

# Step 8: Save prediction into csv
def save_predictions_to_csv(predictions, scaler, file_path):
    """De-normalize predicted (lat, lon) values and write them to CSV.

    Fixes two defects in the naive version: predictions coming from
    `predict` are shaped (samples, output_seq_len, 2) and must be flattened
    before building a 2-column dataframe, and the scaler was fitted on the
    full feature set (5 columns), so it cannot inverse-transform a 2-column
    array directly.  The predicted columns are zero-padded up to the
    scaler's feature count, inverse-transformed, and only the predicted
    columns are kept.

    NOTE(review): assumes the predicted features occupy the FIRST columns
    of the scaler's feature order (lat, lon at indices 0 and 1, matching
    `output_features_index = [0, 1]`) -- confirm against the caller.
    """
    preds = np.asarray(predictions)
    n_out = preds.shape[-1]
    flat = preds.reshape(-1, n_out)

    # pad with zeros when the scaler expects more columns than we predict
    n_features = getattr(scaler, 'n_features_in_', n_out)
    if n_features > n_out:
        padded = np.zeros((flat.shape[0], n_features))
        padded[:, :n_out] = flat
        denorm = scaler.inverse_transform(padded)[:, :n_out]
    else:
        denorm = scaler.inverse_transform(flat)

    df = pd.DataFrame(denorm, columns=['lat', 'lon'])
    df.to_csv(file_path, index=False)

# Example usage: end-to-end pipeline from CSV to trained model predictions.
# NOTE(review): this reads "/home/_Export.csv" but earlier sections export
# to "/home/Export_.csv" -- confirm the intended file name.
data = pd.read_csv("/home/_Export.csv")
input_features = ['lat', 'lon', 'cog', 'sog', 'heading']
output_features_index = [0, 1] # index of lat and lon in input_features
input_seq_len = 10 
output_seq_len = 3
scaler = MinMaxScaler(feature_range=(0,1)) # setting normalization between 0 to 1
# fit the scaler on the full feature matrix before slicing into windows
scaler.fit(data[input_features].to_numpy())
sequences = generate_sequences(data, input_seq_len, output_seq_len, input_features)
sequences_normalized, scaler = normalize_data(sequences, output_features_index, scaler=scaler)


# 60/20/20 train/validation/test split (seeded inside split_data)
train_seq, val_seq, test_seq = split_data(sequences_normalized)

# Model parameters
input_size = len(input_features)  # Number of input features
output_size = len(output_features_index)  # Number of output features
hidden_size = 50  # You can adjust this
num_layers = 2  # You can adjust this
batch_size = 64  # You can adjust this
epochs = 10  # You can adjust this
learning_rate = 0.001  # You can adjust this

# Create dataloaders
dataloaders = {
    'train': create_dataloaders(train_seq, batch_size),
    'val': create_dataloaders(val_seq, batch_size),
    'test': create_dataloaders(test_seq, batch_size)
}

# Initialize model
model = LSTMModel(input_size, hidden_size, output_size, num_layers, output_sequence_size=output_seq_len)

# Train the model
train_model(model, dataloaders, epochs, learning_rate)

# Get predictions from the model
test_predictions = predict(model, dataloaders['test'])

Last updated