Loading Data from Database
from datetime import datetime, timedelta

# Imports from aisdb (module paths may differ slightly between aisdb versions)
import aisdb
from aisdb import DBConn
from aisdb.database.sqlfcn_callbacks import in_timerange

# Database connection
dbconn = DBConn(dbpath="/home/sqlite_database_file.db")

# Generate a query to extract data within a given time range
qry = aisdb.DBQuery(
    dbconn=dbconn,
    callback=in_timerange,
    start=datetime(2015, 8, 1, hour=0),
    end=datetime(2015, 8, 3, hour=2),
)

rowgen_ = qry.gen_qry(verbose=True)  # generate an iterator over the query results
If you also want vessel metadata, re-aggregate the static messages when generating the query:
rowgen_ = qry.gen_qry(reaggregate_static=True, verbose=True)
tracks_ = aisdb.track_gen.TrackGen(rowgen_, decimate=False)
# A common approach to removing anchored pings is to drop pings whose speed is near zero.
tracks_ = aisdb.remove_pings_wrt_speed(tracks_, 0.1)
Some approaches also remove pings near the shore; an example of calculating that distance is provided in Distance from Shore.
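As a rough, self-contained illustration only (not the aisdb Distance from Shore utility), the sketch below computes the haversine distance from each track point to the nearest sample of a user-supplied shoreline coordinate array; points whose distance falls below a chosen threshold could then be dropped. The shoreline arrays are assumed to be provided by you.

# Generic sketch: distance in kilometres from each track point to the nearest shoreline sample
import numpy as np

def distance_to_shore_km(track_lon, track_lat, shore_lon, shore_lat):
    # Haversine distance between every track point and every shoreline sample
    lon1, lat1 = np.radians(track_lon)[:, None], np.radians(track_lat)[:, None]
    lon2, lat2 = np.radians(shore_lon)[None, :], np.radians(shore_lat)[None, :]
    a = np.sin((lat2 - lat1) / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    dist = 2 * 6371.0 * np.arcsin(np.sqrt(a))
    # Keep, for each track point, the distance to the closest shoreline sample
    return dist.min(axis=1)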
Interpolating Tracks
# Split tracks into voyages based on distance and speed thresholds
tracks_ = aisdb.encode_greatcircledistance(
    tracks_,
    distance_threshold=50000,
    minscore=1e-5,
    speed_threshold=50,
)
# Interpolate each voyage onto a regular 5-minute time step
tracks_ = aisdb.interp_time(tracks_, step=timedelta(minutes=5))
Saving into CSV
aisdb.write_csv(tracks_, "/home/Export_.csv")
QGIS is a cross-platform desktop geographic information system that supports viewing, editing, printing, and analyzing geospatial data.
The CSV can be imported via Layer > Add Layer > Add Delimited Text Layer.
The tracks can then be built with the Points to Path tool in the QGIS Processing Toolbox, using Track_ID as the grouping parameter.
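If you prefer to script the import, the layer can also be added from the QGIS Python console with the delimited-text provider. This is only a sketch: it assumes the exported CSV has lon and lat columns in EPSG:4326, and the Points to Path step can then be run from the Processing Toolbox as described above.

# Run inside the QGIS Python console; assumes lon/lat columns in WGS84 (EPSG:4326)
from qgis.core import QgsVectorLayer, QgsProject

uri = "file:///home/Export_.csv?delimiter=,&xField=lon&yField=lat&crs=EPSG:4326"
layer = QgsVectorLayer(uri, "ais_tracks", "delimitedtext")
if layer.isValid():
    QgsProject.instance().addMapLayer(layer)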
Clustering Tracks
The exported CSV can also be grouped into clusters of similar trajectories. In the example below, each trajectory is reduced to the slope of a linear regression over its points, and DBSCAN groups trajectories whose slopes are close.

import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

def slope_distance(trj1, trj2):
    # Distance between two trajectories: difference of their regression slopes
    s1_ = calculate_regression_slope(trj1)
    distt = s1_ - calculate_regression_slope(trj2)
    return distt

def calculate_regression_slope(trajectory):
    # Separate the trajectory into two lists, x and y
    x = [point[0] for point in trajectory]
    y = [point[1] for point in trajectory]
    # Perform a first-degree (linear) fit and keep only the slope
    slope, intercept = np.polyfit(x, y, 1)
    return slope
def cluster_trajectories_in_df(df, eps=0.5, min_samples=1, cluster_col_name='cluster', parts=10, threshold=0.8):
    # Get unique trajectory identifiers
    unique_ids = df['Track_ID'].unique()
    # Extract trajectories as (lon, lat) arrays
    trajectories = [df[df['Track_ID'] == uid][['lon', 'lat']].values for uid in unique_ids]

    # Pairwise distance matrix between trajectories. The absolute slope difference is used here;
    # a point-wise haversine comparison is left commented out as an alternative.
    def trajectory_distances(X):
        dists = np.zeros((len(X), len(X)))
        for i, traj1 in enumerate(X):
            for j, traj2 in enumerate(X):
                if i <= j:
                    # traj_dist = [geodesic(p1, p2).kilometers for p1, p2 in zip(traj1, traj2)]
                    # dists[i, j] = 1 - sum(d <= radius for d in traj_dist) / len(traj_dist)
                    dists[i, j] = abs(slope_distance(traj1, traj2))
                    dists[j, i] = dists[i, j]
        return dists

    # Calculate the distance matrix between trajectories
    dist_matrix = trajectory_distances(trajectories)

    # Perform DBSCAN clustering on the precomputed distances
    clusterer = DBSCAN(metric='precomputed', eps=eps, min_samples=min_samples)
    clusters = clusterer.fit_predict(dist_matrix)

    # Create a mapping of trajectory identifiers to cluster labels
    id_to_cluster = {uid: cluster for uid, cluster in zip(unique_ids, clusters)}

    # Add a new column to the data frame for cluster labels
    df[cluster_col_name] = df['Track_ID'].map(id_to_cluster)

    # Return the data frame with cluster labels
    return df
df__ = pd.read_csv("/home/Export_.csv")
df2_ = cluster_trajectories_in_df(df__)  # adjust the DBSCAN parameters as needed
df2_.to_csv("/home/grouped_tracks.csv", index=False)
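To get a quick sense of the grouping, you can count how many distinct tracks ended up in each cluster, for example:

# Number of distinct tracks per cluster label (-1 marks DBSCAN noise points)
print(df2_.groupby('cluster')['Track_ID'].nunique())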
Reading the CSV and transforming the data depends on the task we want to perform on the tracks. Here we provide an example of using the CSV in a sequence-to-sequence model that takes 10 AIS messages as input and predicts the next 3 points.
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Read the exported file into a data frame
data = pd.read_csv("/home/Export_.csv")
# Define the input sequence length and output sequence length
input_sequence_length = 10
output_sequence_length = 3
output_feature_length = 2  # lon, lat

# Group data by Track_ID and convert each track into a sequence
sequences = []
output_sequences = []
feature_set = ['lon', 'lat', 'cog', 'heading', 'sog']

# Each track is identified by 'Track_ID', so we group by it
track_ids = data['Track_ID'].unique()

# For each track, slice based on the defined input/output window
for track_id in track_ids:
    track_data = data[data['Track_ID'] == track_id][feature_set].values
    num_points = len(track_data)  # total points of this track
    # start slicing
    for i in range(0, num_points - input_sequence_length - output_sequence_length + 1):
        input_seq = track_data[i:i + input_sequence_length]
        output_seq = track_data[i + input_sequence_length:i + input_sequence_length + output_sequence_length]
        sequences.append(input_seq)
        output_sequences.append(output_seq)

# Convert sequences and output_sequences to numpy arrays
sequences = np.array(sequences)
output_sequences = np.array(output_sequences)
print(sequences.shape)  # (total samples, input_sequence_length, features)
print(output_sequences.shape)
# Standardize the data
scaler = StandardScaler()
sequences = sequences.reshape(-1, len(feature_set))  # 5 features (lon, lat, cog, heading, sog)
sequences = scaler.fit_transform(sequences)

# Reshape sequences back to the original shape
sequences = sequences.reshape(-1, input_sequence_length, len(feature_set))

# The output sequences must be normalized with the same scaler
output_sequences = output_sequences.reshape(-1, len(feature_set))
output_sequences = scaler.transform(output_sequences)
output_sequences = output_sequences.reshape(-1, output_sequence_length, len(feature_set))
output_sequences = output_sequences[:, :, 0:2]  # keep only lon and lat
print(output_sequences.shape)
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(sequences, output_sequences, test_size=0.2, random_state=42)
# Define the sequence-to-sequence model using an LSTM
model = Sequential()
model.add(LSTM(100, activation='relu', input_shape=(input_sequence_length, 5)))
model.add(RepeatVector(output_sequence_length))
model.add(LSTM(100, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(2))) # 2 features (lon, lat) in the output
model.compile(optimizer='adam', loss='mse') # You can choose a different loss function if needed
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_val, y_val))
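Once the Keras model above is trained, its predictions are still in standardized units. Since the StandardScaler was fitted on all five features but the model only outputs lon and lat, one way to map predictions back to degrees is to invert the scaling using the scaler statistics of the first two feature columns. A minimal sketch (variable names are illustrative):

# Predict the next 3 positions for the validation set; output is still in scaled units
pred_scaled = model.predict(X_val)  # shape: (samples, output_sequence_length, 2)

# Invert the standardization for lon and lat only (the first two columns of feature_set)
lonlat_mean = scaler.mean_[:2]
lonlat_scale = scaler.scale_[:2]
pred_lonlat = pred_scaled * lonlat_scale + lonlat_mean  # broadcasts over (samples, 3, 2)
print(pred_lonlat.shape)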
The same workflow can also be implemented in PyTorch:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

# Step 1: Load CSV
def load_csv(file_path):
    return pd.read_csv(file_path)
# Step 2: Generate sequences from data
def generate_sequences(data, input_seq_len, output_seq_len, input_features):
    sequences = []
    for track_id in data['Track_ID'].unique():
        track_data = data[data['Track_ID'] == track_id]
        for i in range(len(track_data) - input_seq_len - output_seq_len + 1):
            input_seq = track_data[i:i + input_seq_len][input_features].values
            output_seq = track_data[i + input_seq_len:i + input_seq_len + output_seq_len][input_features].values
            sequences.append((input_seq, output_seq))
    return sequences
# Step 3: Normalize the data
def normalize_data(sequences, output_features_index, scaler):
    sequences_normalized = []
    for input_seq, output_seq in sequences:
        input_seq_normalized = scaler.transform(input_seq)
        output_seq_normalized = scaler.transform(output_seq)[:, output_features_index]
        sequences_normalized.append((input_seq_normalized, output_seq_normalized))
    return sequences_normalized, scaler
# Step 4: Split the data into training, test, and validation sets
def split_data(sequences):
    train_seq, test_seq = train_test_split(sequences, test_size=0.2, random_state=42)
    train_seq, val_seq = train_test_split(train_seq, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2
    return train_seq, val_seq, test_seq
# Step 5: Model initialization
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, output_sequence_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.output_sequence_size = output_sequence_size
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, x):
        batch_size, sequence_length, _ = x.size()
        output_sequence = []
        # Initialize the hidden state and cell state
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        for t in range(self.output_sequence_size):
            # Re-run the input through the LSTM, carrying the hidden and cell states forward
            output_t, (hidden, cell) = self.lstm(x, (hidden, cell))
            # Project the output at the last time step down to the output features
            output_t = self.linear(output_t[:, -1, :])
            # Append the output to the output_sequence
            output_sequence.append(output_t)
        # Stack the per-step outputs along the sequence dimension
        output_sequence = torch.stack(output_sequence, dim=1)
        return output_sequence
# Function to create data loaders
def create_dataloaders(sequences, batch_size):
    def to_tensor(sequences):
        inputs = torch.tensor(np.stack([item[0] for item in sequences]), dtype=torch.float32)
        targets = torch.tensor(np.stack([item[1] for item in sequences]), dtype=torch.float32)
        return inputs, targets
    inputs, targets = to_tensor(sequences)
    dataset = TensorDataset(inputs, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Step 6: Train the model in batches
def train_model(model, dataloaders, epochs, learning_rate):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    for epoch in range(epochs):
        model.train()
        for inputs, targets in dataloaders['train']:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        # Validation step
        model.eval()
        with torch.no_grad():
            val_loss = 0
            for inputs, targets in dataloaders['val']:
                outputs = model(inputs)
                val_loss += criterion(outputs, targets).item()
        print(f'Epoch {epoch+1}/{epochs}, Training Loss: {loss.item():.4f}, Validation Loss: {val_loss/len(dataloaders["val"]):.4f}')
# Step 7: Get predictions from the model
def predict(model, dataloader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            outputs = model(inputs)
            predictions.append(outputs.numpy())
    return np.concatenate(predictions)
# Step 8: Save predictions into CSV
def save_predictions_to_csv(predictions, scaler, file_path):
    # predictions has shape (samples, output_seq_len, 2); flatten to one row per predicted point
    preds_2d = predictions.reshape(-1, predictions.shape[-1])
    # The scaler was fit on all five input features; invert only the lat/lon columns (indices 0 and 1)
    predictions_denorm = (preds_2d - scaler.min_[:2]) / scaler.scale_[:2]
    df = pd.DataFrame(predictions_denorm, columns=['lat', 'lon'])
    df.to_csv(file_path, index=False)
# Example usage
data = load_csv("/home/Export_.csv")
input_features = ['lat', 'lon', 'cog', 'sog', 'heading']
output_features_index = [0, 1]  # indices of lat and lon in input_features
input_seq_len = 10
output_seq_len = 3
scaler = MinMaxScaler(feature_range=(0, 1))  # normalize features to the range 0..1
scaler.fit(data[input_features].to_numpy())
sequences = generate_sequences(data, input_seq_len, output_seq_len, input_features)
sequences_normalized, scaler = normalize_data(sequences, output_features_index, scaler=scaler)
train_seq, val_seq, test_seq = split_data(sequences_normalized)
# Model parameters
input_size = len(input_features) # Number of input features
output_size = len(output_features_index) # Number of output features
hidden_size = 50 # You can adjust this
num_layers = 2 # You can adjust this
batch_size = 64 # You can adjust this
epochs = 10 # You can adjust this
learning_rate = 0.001 # You can adjust this
# Create data loaders
dataloaders = {
    'train': create_dataloaders(train_seq, batch_size),
    'val': create_dataloaders(val_seq, batch_size),
    'test': create_dataloaders(test_seq, batch_size),
}
# Initialize model
model = LSTMModel(input_size, hidden_size, output_size, num_layers, output_sequence_size=output_seq_len)
# Train the model
train_model(model, dataloaders, epochs, learning_rate)
# Get predictions from the model
test_predictions = predict(model, dataloaders['test'])
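Finally, the test-set predictions can be written back to disk with the helper from Step 8; the output path below is only an illustration.

# Denormalize the predicted lat/lon values and save them (illustrative output path)
save_predictions_to_csv(test_predictions, scaler, "/home/predicted_points.csv")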