Auto Byte

Science AI

手把手：四色猜想、七桥问题…程序员眼里的图论，了解下？（附大量代码和手绘）

Leonhard Euler（莱昂哈德·欧拉）

Wow

```BinTreeNode<Apple>* root = new BinTreeNode<Apple>("Green");

root->left = new BinTreeNode<Apple>("Yellow");
root->right = new BinTreeNode<Apple>("Yellow 2");

BinTreeNode<Apple>* yellow_2 = root->right;

yellow_2->left = new BinTreeNode<Apple>("Almost red");
yellow_2->right = new BinTreeNode<Apple>("Red");```

Airbnb房屋查询实例：二叉搜索树

Airbnb房屋搜索截图

```// feel free to reorganize this struct to avoid redundant space
// usage because of aligning factor
// Remark 1: some of the properties could be expressed as enums,
// bitset is chosen for as multi-value enum holder.
// Remark 2: for most of the count values the maximum is 16
// Remark 3: price value considered as integer,
// int considered as 4 byte.
// Remark 4: neighborhoods property omitted
// Remark 5: to avoid spatial queries, we're
// using only country code and city name, at this point won't consider
// the actual coordinates (latitude and longitude)
struct AirbnbHome
{
wstring name; // wide string
uint price;
uchar rating;
uint rating_count;
vector<string> photos; // list of photo URLs
string host_id;
uchar children_number; // max is 5
uchar infants_number; // max is 5
bitset<3> home_type;
uchar beds_number;
uchar bedrooms_number;
uchar bathrooms_number;
bitset<21> accessibility;
bool superhost;
bitset<20> amenities;
bitset<6> facilities;
bitset<34> property_types;
bitset<32> host_languages;
bitset<3> house_rules;
ushort country_code;
string city;
};```

```bool HasWasher(AirbnbHome* h)
{
return h->amenities[2];
}```

```const int KITCHEN = 0;
const int HEATING = 1;
const int WASHER = 2;
//...
bool HasWasher(AirbnbHome* h)
{
return (h != nullptr) && h->amenities[WASHER];
}

bool HasWasherAndKitchen(AirbnbHome* h)
{
return (h != nullptr) && h->amenities[WASHER] && h->amenities[KITCHEN];
}

bool HasAllAmenities(AirbnbHome* h, const std::vector<int>& amenities)
{
bool has = (h != nullptr);
for (const auto a : amenities) {
has &= h->amenities[a];
}
return has;
}```

```// Note the comments
struct AirbnbHome
{
wstring name; // 200 bytes
uint price; // 4 bytes
uchar rating; // 1 byte
uint rating_count; // 4 bytes
vector<string> photos; // 100 bytes
string host_id; // 20 bytes
uchar children_number; // 1 byte
uchar infants_number; // 1 byte
bitset<3> home_type; // 4 bytes
uchar beds_number; // 1 byte
uchar bedrooms_number; // 1 byte
uchar bathrooms_number; // 1 byte
bitset<21> accessibility; // 4 bytes
bool superhost; // 1 byte
bitset<20> amenities; // 4 bytes
bitset<6> facilities; // 4 bytes
bitset<34> property_types; // 8 bytes
bitset<32> host_languages; // 4 bytes, correct me if I'm wrong
bitset<3> house_rules; // 4 bytes
ushort country_code; // 2 bytes
string city; // 50 bytes
};```

420字节加上之前提到的额外32字节，总共452字节的容量，考虑到有时候你或许会受困于一些校准因素，所以让我们把容量调到500字节。那么每条房屋信息占用500字节内存，对于包含所有房屋信息的列表（有时候我们会混淆列表数量和真实房屋数量，如果我犯错了请告诉我），占用内存约500字节＊4百万＝1.86GB，近似2GB，这个结果看起来很合理。

“计算”算法复杂度其实就是在操作数和输入大小之间找到一个依赖关系，上面那个例子中数组有100个元素，操作数也是100次，如果数组元素个数（输入）增加到1423个，找到任意指定元素的操作数也增加到1423次（最坏的情况下）。

```struct NTreeNode
{
T value;
vector<NTreeNode*> children;
};```

```// almost pseudocode
class NTree
{
public:
void Insert(const T&);
void Remove(const T&);
// lines of omitted code
private:
NTreeNode* root_;
};```

```struct GraphNode

{

T value;

};```

```class ConnectedGraph
{
public:
// API

private:
GraphNode* root_;```

```class DisconnectedGraphOrJustAGraph
{
public:
// API

private:
std::vector<GraphNode*> all_roots_;
};```

```// A representation of a graph with both vertex and edge tables
// Vertex table is a hashtable of edges (mapped by label)
// Edge table is a structure with 4 fields
// VELO = Vertex Edge Label Only (e.g. no vertex payloads)

class ConnectedVELOGraph {
public:
struct Edge {
Edge(const std::string& f, const std::string& t)
: from(f)
, to(t)
, used(false)
, next(nullptr)
{}
std::string ToString() {
return (from + " - " + to + " [used:" + (used ? "true" : "false") + "]");
}

std::string from;
std::string to;
bool used;
Edge* next;
};

ConnectedVELOGraph() {}
~ConnectedVELOGraph() {
vertices_.clear();
for (std::size_t ix = 0; ix < edges_.size(); ++ix) {
delete edges_[ix];
}
}

public:
void InsertEdge(const std::string& from, const std::string& to) {
Edge* e = new Edge(from, to);
InsertVertexEdge_(from, e);
InsertVertexEdge_(to, e);
edges_.push_back(e);
}

public:
void Print() {
for (auto elem : edges_) {
std::cout << elem->ToString() << std::endl;
}
}

std::vector<std::string> Trace(const std::string& v) {
std::vector<std::string> path;
Edge* e = vertices_[v];
while (e != nullptr) {
if (e->used) {
e = e->next;
} else {
e->used = true;
path.push_back(e->from + ":-:" + e->to);
e = vertices_[e->to];
}
}
return path;
}

private:
void InsertVertexEdge_(const std::string& label, Edge* e) {
if (vertices_.count(label) == 0) {
vertices_[label] = e;
} else {
vertices_[label]->next = e;
}
}

private:
std::unordered_map<std::string, Edge*> vertices_;
std::vector<Edge*> edges_;
};```

```// 'author' represents the User object, at this point we are interested only in author.id
//
// 'tw' is a Tweet object, at this point we are interested only in 'tw.id'

void DeliverATweet(User* author, Tweet* tw)
{
// we assume that 'tw' object is already stored in a database

// 1. Get the list of user's followers (author of the tweet)
vector<User*> user_followers = GetUserFollowers(author->id);

// 2. insert tweet into each timeline
for (auto follower : user_followers) {
InsertTweetIntoUserTimeline(follower->id, tw->id);
}
}```

```// Warning: a bunch of pseudocode ahead

void RangeInsertIntoTimelines(vector<long> user_ids, long tweet_id)
{
for (auto id : user_ids) {
InsertIntoUserTimeline(id, tweet_id);
}
}

void DeliverATweet(User* author, Tweet* tw)
{
// we assume that 'tw' object is already stored in a database

// 1. Get the list of user's (tweet author's) followers's ids
vector<long> user_followers = GetUserFollowers(author->id);

// 2. Insert tweet into each timeline in parallel
const int CHUNK_SIZE = 4000; // saw this somewhere
for (each CHUNK_SIZE elements in user_followers) {
t.Run(RangeInsertIntoTimelines, current_chunk, tw->id);
}
}```

`house_type: "entire_place",adults_number: 2,price_range_start: 56,price_range_end: 80,beds_number: 2,amenities: ["tv", "wifi", "laptop friendly workspace"],facilities: ["gym"]`

Hopcroft-Karp算法是种以一个二分图作为输入，并产生一个最大基数匹配（maximum cardinality matching）的算法。 最大基数匹配得出的结果是一组由尽可能多的边组成的集合，而集合中所有的边都不会共享节点。

Hopcroft-Karp算法（以及其他许多算法）都是同时基于DFS（深度优先搜索）和BFS（广度优先搜索）图遍历算法。

```// struct ListNode {
//   ListNode* next;
//   T item;
// };

void TraverseRecursive(ListNode* node) // starting node, most commonly the list 'head'
{
if (!node) return; // stop
std::cout << node->item;
TraverseRecursive(node->next); // recursive call
}

void TraverseIterative(ListNode* node)
{
while (node) {
std::cout << node->item;
node = node->next;
}
}```

• 先打印目前节点的值，然后打印相连的左子节点，最后是右子节点；

• 先打印相连的左子节点，然后打印目前节点的值，最后是右子节点；

• 先打印相连的左子节点，然后打印右子节点的值，最后是目前节点。

```// struct TreeNode {
//   T item;
//   TreeNode* left;
//   TreeNode* right;
// }

// you cann pass a callback function to do whatever you want to do with the node's value
// in this particular example we are just printing its value.

// node is the "starting point", basically the first call is done with the "root" node
void PreOrderTraverse(TreeNode* node)
{
if (!node) return; // stop
std::cout << node->item;
PreOrderTraverse(node->left); // do the same for the left sub-tree
PreOrderTraverse(node->right); // do the same for the right sub-tree
}

void InOrderTraverse(TreeNode* node)
{
if (!node) return; // stop
InOrderTraverse(node->left);
std::cout << node->item;
InOrderTraverse(node->right);
}

void PostOrderTraverse(TreeNode* node)
{
if (!node) return; // stop
PostOrderTraverse(node->left);
PostOrderTraverse(node->right);
std::cout << node->item;```

显然，递归函数看起来非常优雅，虽然它们的运行时间一般都很长。 每次递归调用函数时，这意味着我们需要调用一个完全新的的函数（参见上图）。

“新”的意思是，我们应该给函数参数和局部变量分配另一个堆栈存储空间。 这就是为什么递归调用在资源上显得很昂贵（额外的堆栈空间分配和多个函数调用）而且比较危险（需要注意堆栈溢出）。

Netflix和亚马逊个性化推荐实例：倒排索引

“KISK”或““let’s keep it simple”或“for the sake of simplicity”，这简直就是教程编写者从真实问题中找一些抽象化的简单的例子，并以伪代码形式来讲解的超级借口。这些事例的解决方案简单到往往在祖父母一辈的电脑上都能运行。

Netflix里有不少影片，亚马逊里有很多商品，在这里我们把影片和商品叫做“物品”，所以每当你查找“物品”时，可以联想Netflix中的电影或亚马逊的任何合适的产品。

```// Cached representation of an Item
// Full Item object (with title, description, comments etc.)
// could be fetched from the database
struct Item
{
// ID_TYPE is the type of Item's unique id, might be an integer, or a string
ID_TYPE id;
int rating;
};```

```// this is a pseudocode, that's why I didn't bother with "const&"'s and "std::"'s
// though it could have look better, forgive me C++ fellows

vector<Item*> GetItemsByKeywordInSortedOrder(string keyword)
{
// assuming IndexTable is a big hashtable mapping keywords to Item BSTs
BST<Item*> items = IndexTable[keyword];

// suppose BST has a function InOrderProduceVector(), which creates a vector and
// inserts into it items fetched via in-order traversing the tree
vector<Item*> sorted_result = items.InOrderProduceVector();
return sorted_result;
}```

```template <typename BlaBla>
class BST
{
public:
// other code ...
vector<BlaBla*> InOrderProduceVector()
{
vector<BlaBla*> result;
result.reserve(1000); // magic number, reserving a space to avoid reallocation on inserts
InOrderProduceVectorHelper_(root_, result); // passing vector by reference
return result;
}

protected:
// takes a reference to vector
void InOrderProduceVectorHelper_(BSTNode* node, vector<BlaBla*>& destination)
{
if (!node) return;
InOrderProduceVectorHelper_(node->left, destination);
destination.push_back(node->item);
InOrderProduceVectorHelper_(node->right, destination);
}

private:
BSTNode* root_;
};```

```// Reminder: this is pseudocode, no bother with "const&", "std::" or others
// forgive me C++ fellows

template <typename BlaBla>
class BST
{
public:
// other code ...

vector<BlaBla*> ReverseInOrderProduceVector(int offset, int limit)
{
vector<BlaBla*> result;
result.reserve(limit);
// passing result vector by reference
// and passing offset and limit
ReverseInOrderProduceVectorHelper_(root_, result, offset, limit);
return result;
}

protected:
// takes a reference to vector
// skips 'offset' nodes and inserts up to 'limit' nodes
void ReverseInOrderProduceVectorHelper_(BSTNode* node, vector<BlaBla*>& destination, int offset, int limit)
{
if (!node) return;
if (limit == 0) return;
--offset; // skipping current element
ReverseInOrderProduceVectorHelper_(node->right, destination, offset, limit);
if (offset <= 0) { // if skipped enough, insert
destination.push_back(node->value);
--limit; // keep the count of insertions
}
ReverseInOrderProduceVectorHelper_(node->left, destination, offset, limit);
}

private:
BSTNode* root_;
};

// ... other possibly useful code

// this is a pseudocode, that's why I didn't bother with "const&"'s and "std::"'s
// though it could have look better, forgive me C++ fellows

vector<Item*> GetItemsByKeywordInSortedOrder(string keyword, offset, limit) // pagination using offset and limit
{
// assuming IndexTable is a big hashtable mapping keywords to Item BSTs
BST<Item*> items = IndexTable[keyword];

// suppose BST has a function InOrderProduceVector(), which creates a vector and
// inserts into it items fetched via reverse in-order traversing the tree
// to get items in descending order (starting from the highest rated item)
vector<Item*> sorted_result = items.ReverseInOrderProduceVector(offset, limit);
return sorted_result;
}```

深度优先搜索DFS和广度优先搜索BFS

DFS很像先序遍历、中序遍历、后序遍历，而BFS可以逐层遍历节点。为实现BFS，需要一个队列（数据结构）来保存图的“层次”，同时打印（访问）其“父母层”。

```// Assuming graph is connected
// and a graph node is defined by this structure
// struct GraphNode {
//   T item;
//   vector<GraphNode*> children;
// }

// WARNING: untested code
void BreadthFirstSearch(GraphNode* node) // start node
{
if (!node) return;
queue<GraphNode*> q;
q.push(node);
while (!q.empty()) {
GraphNode* cur = q.front(); // doesn't pop
q.pop();
for (auto child : cur->children) {
q.push(child);
}

// do what you want with current node
cout << cur->item;
}
}```

Uber最短路线实例

Uber有5000万乘客和700万司机，所以高效地将司机与乘客进行匹配非常重要。司机使用Uber司机专用应用来接单，乘客就用Uber应用叫车。

1.将所有节点设为未访问。设置一个包含所有未被访问节点的集合，称为未访问集合。

2. 对所有顶点的路径长度赋暂定值：将起始节点的路径长度设为0，所有其他顶点的路径长度设为无穷大。将起始节点设为当前节点。

3.对于当前节点，考虑其周围所有未访问的相邻节点，并且计算通过当前节点到它们的暂定距离。将新计算得到的暂定距离与当前分配的距离进行比较并选择较小的值然后分配。例如，如果当前节点A的距离为6，连之相邻B的长度为2，则通过A到B的距离将为6 + 2 = 8。如果B先前被标记的距离大于8，则将其更改为8.否则，则保持当前值。

4.当我们考虑完当前节点的所有相邻节点时，将当前节点标记为已访问并将其从未访问节点集中移除。被访问的节点将不会再次被查看。

5.如果目标节点已经被标记为已访问（当目标是两个特定节点之间的路径）或者未访问集合中的节点之间的最小暂定距离是无穷大时（目标完全遍历时;发生在初始节点和剩余的未访问节点之间没有连接时），将会停止。算法已经完成。

6.否则，选择未访问过的并且具有最小暂定距离的节点，把它作为新的“当前节点”，然后回到步骤3。